-
Notifications
You must be signed in to change notification settings - Fork 0
/
v1.s
166 lines (145 loc) · 3.37 KB
/
v1.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# v1: scan memory for a 32-byte block containing the byte 0x2a.
#
# Syntax: GNU as, Intel operand order (.intel_syntax noprefix).
# ISA:    uses 256-bit integer compares (vpcmpeqb ymm), i.e. AVX2.
# Register contract, as established by the code in this file (this is not the
# System V argument register; NOTE(review): confirm against the caller):
#   in:   rax = start address of the buffer to scan
#   out:  rax = final read pointer minus the start address (computed at `end`)
#   uses: rdx (read pointer), r11 (zeroed here), ymm0 (needle), flags, plus
#         whatever scratch registers the selected loop touches
# No length is passed in: the scan loops only stop when a block matches.
.text
.globl v1
.intel_syntax noprefix
.align 32
# 32 copies of the needle byte 0x2a, 32-byte aligned so the aligned load below
# is legal. Written with .fill (count, size, value) rather than .zero with a
# fill argument, which emits the same bytes but reads as "zeros".
.needle:
.fill 32, 1, 0x2a
v1:
# rdx walks the buffer; rax keeps the start address for the final subtraction.
mov rdx, rax
# Zero r11 with the xor idiom (a 32-bit write clears the upper half as well).
# Flags are clobbered, but nothing reads them before the loops set them again.
xor r11d, r11d
# Broadcast needle for every loop variant. A single 256-bit load is enough:
# it defines all of ymm0, so no separate xmm0 load is needed beforehand.
vmovaps ymm0, ymmword ptr [rip + .needle]
# Select the loop variant to run.
jmp loop_ispc_pragma_x2
# ISPC version
# Scan variant: one 32-byte block per iteration. Each byte of the block at
# [rdx] is compared with ymm0; the loop exits on the first block whose compare
# mask is non-zero.
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   out:  rdx = address just past the block that produced the non-zero mask
#   uses: ymm1, ymm2, ecx, r11, flags
# The loop has no length check; it only terminates on a match.
.align 64
loop_og: # 494ms
# Unaligned 32-byte load of the current block.
vmovdqu ymm1, ymmword ptr [rdx]
# Byte-wise equality against ymm0, then one mask bit per byte lane into ecx.
vpcmpeqb ymm2, ymm1, ymm0
vpmovmskb ecx, ymm2
add rdx, 32
# r11 is advanced in step with rdx but is not read anywhere in this loop.
add r11, 32
# Zero mask means no lane matched: continue with the next block.
test ecx, ecx
je loop_og
jmp end
# ISPC version, but without the redundant add
# Scan variant: one 32-byte block per iteration, with only the read pointer
# being advanced.
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   out:  rdx = address just past the block that produced the non-zero mask
#   uses: ymm1, ymm2, ecx, flags
# The loop has no length check; it only terminates on a match.
.align 64
loop_fixed: # 494ms
# Unaligned 32-byte load of the current block.
vmovdqu ymm1, ymmword ptr [rdx]
# Byte-wise equality against ymm0, then one mask bit per byte lane into ecx.
vpcmpeqb ymm2, ymm1, ymm0
vpmovmskb ecx, ymm2
add rdx, 32
# Zero mask means no lane matched: continue with the next block.
test ecx, ecx
je loop_fixed
jmp end
# Scan variant: 64 bytes (two 32-byte blocks) per iteration. The loads are
# folded into the compares as memory operands, and the two compare results are
# OR-ed so a single mask extraction and branch covers both blocks.
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   out:  rdx = address just past the 64-byte pair that produced the match
#   uses: ymm1, ymm2, ymm3, ecx, flags
# The exit does not record which of the two blocks matched.
# The loop has no length check; it only terminates on a match.
.align 64
loop_ispc: # 483ms
# Compare the second block, then the first, directly from memory.
vpcmpeqb ymm2, ymm0, ymmword ptr [rdx + 32]
vpcmpeqb ymm1, ymm0, ymmword ptr [rdx]
# Merge both results: any set lane in either block survives the OR.
vpor ymm3, ymm1, ymm2
add rdx, 64
vpmovmskb ecx, ymm3
# Zero mask means neither block matched: continue with the next pair.
test ecx, ecx
je loop_ispc
jmp end
# unrolled 2x, ISPC pragma unroll
# Scan variant: two 32-byte blocks per iteration, each tested on its own, so a
# hit in the first block exits before the second block is touched.
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   uses: ymm1, ecx, r11, flags
# The loop has no length check; it only terminates on a match.
# NOTE(review): the two exits leave rdx in different states. A first-block hit
# reaches `end` with rdx still at the start of that block; a second-block hit
# reaches `end` with rdx already advanced by 0x40. Confirm which value the
# caller expects before relying on the returned offset.
.align 64
loop_ispc_pragma_x2: # 481ms
# First block: compare straight from memory and extract the lane mask.
vpcmpeqb ymm1, ymm0, ymmword ptr [rdx]
vpmovmskb ecx, ymm1
test ecx, ecx
# Hit in the first block: leave with rdx not yet advanced.
jne end
# Second block, 0x20 bytes further on.
vpcmpeqb ymm1, ymm0, ymmword ptr [rdx + 0x20]
vpmovmskb ecx, ymm1
# r11 is advanced in step with rdx but is not read anywhere in this loop.
add r11, 0x40
add rdx, 0x40
# Zero mask means the second block did not match either: next pair.
test ecx, ecx
je loop_ispc_pragma_x2
# Hit in the second block. This jump was missing: execution used to run on
# through the alignment padding into the following loop variant and keep
# scanning instead of returning.
jmp end
# no unrolling, NT load, w/o redundant add
# Scan variant: one 32-byte block per iteration, loaded with vmovntdqa
# (load with a non-temporal hint).
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   out:  rdx = address just past the block that produced the non-zero mask
#   uses: ymm1, ymm2, ecx, flags
# The 256-bit form of vmovntdqa requires its memory operand to be 32-byte
# aligned, so rdx must be 32-byte aligned on entry to this loop.
# The loop has no length check; it only terminates on a match.
.align 64
loop_nt: # 481ms
vmovntdqa ymm1, ymmword ptr [rdx]
# Byte-wise equality against ymm0, then one mask bit per byte lane into ecx.
vpcmpeqb ymm2, ymm1, ymm0
vpmovmskb ecx, ymm2
add rdx, 32
# Zero mask means no lane matched: continue with the next block.
test ecx, ecx
je loop_nt
jmp end
# unrolled 2x, NT load, w/o redundant add
# Scan variant: 64 bytes (two 32-byte blocks) per iteration, both loaded with
# vmovntdqa (load with a non-temporal hint). The two compare results are OR-ed
# so one mask extraction and branch covers both blocks.
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   out:  rdx = address just past the 64-byte pair that produced the match
#   uses: ymm1-ymm4, ecx, flags
# The 256-bit form of vmovntdqa requires a 32-byte-aligned memory operand, so
# rdx must be 32-byte aligned on entry to this loop.
# The exit does not record which of the two blocks matched.
# The loop has no length check; it only terminates on a match.
.align 64
loop_nt_x2: # 474ms
vmovntdqa ymm1, ymmword ptr [rdx]
vmovntdqa ymm2, ymmword ptr [rdx + 0x20]
vpcmpeqb ymm3, ymm1, ymm0
vpcmpeqb ymm4, ymm2, ymm0
# Merge both results: any set lane in either block survives the OR.
vpor ymm4, ymm4, ymm3
vpmovmskb ecx, ymm4
add rdx, 0x40
# Zero mask means neither block matched: continue with the next pair.
test ecx, ecx
je loop_nt_x2
jmp end
# unrolled 4x, NT load, w/o redundant add
# Scan variant: 128 bytes (four 32-byte blocks) per iteration, all loaded with
# vmovntdqa (load with a non-temporal hint). The four compare results are
# OR-reduced to one register so a single mask extraction and branch covers the
# whole 128 bytes.
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   out:  rdx = address just past the 128-byte group that produced the match
#   uses: ymm1-ymm8, ecx, flags
# The 256-bit form of vmovntdqa requires a 32-byte-aligned memory operand, so
# rdx must be 32-byte aligned on entry to this loop.
# The exit does not record which of the four blocks matched.
# The loop has no length check; it only terminates on a match.
.align 64
loop_nt_x4: # 482ms
# Blocks 0 and 1 -> combined result in ymm6.
vmovntdqa ymm1, ymmword ptr [rdx]
vmovntdqa ymm2, ymmword ptr [rdx + 0x20]
vpcmpeqb ymm5, ymm1, ymm0
vpcmpeqb ymm6, ymm2, ymm0
vpor ymm6, ymm6, ymm5
# Blocks 2 and 3 -> combined result in ymm8.
vmovntdqa ymm3, ymmword ptr [rdx + 0x40]
vmovntdqa ymm4, ymmword ptr [rdx + 0x60]
vpcmpeqb ymm7, ymm3, ymm0
vpcmpeqb ymm8, ymm4, ymm0
vpor ymm8, ymm8, ymm7
# Fold the two halves together: ymm8 now covers all four blocks.
vpor ymm8, ymm8, ymm6
vpmovmskb ecx, ymm8
add rdx, 0x80
# Zero mask means none of the four blocks matched: next group.
test ecx, ecx
je loop_nt_x4
jmp end
# unrolled 2x, NT load, w/o redundant add, prefetch
# Scan variant: 64 bytes (two 32-byte blocks) per iteration, loaded with
# vmovntdqa (load with a non-temporal hint), plus a prefetchnta issued 0x80
# bytes ahead of the read pointer, i.e. two iterations ahead at 0x40 bytes per
# iteration.
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   out:  rdx = address just past the 64-byte pair that produced the match
#   uses: ymm1-ymm4, ecx, flags
# The 256-bit form of vmovntdqa requires a 32-byte-aligned memory operand, so
# rdx must be 32-byte aligned on entry to this loop.
# The exit does not record which of the two blocks matched.
# The loop has no length check; it only terminates on a match.
.align 64
loop_nt_x2_prefetch: # 467ms
prefetchnta byte ptr [rdx + 0x80]
vmovntdqa ymm1, ymmword ptr [rdx]
vmovntdqa ymm2, ymmword ptr [rdx + 0x20]
vpcmpeqb ymm3, ymm1, ymm0
vpcmpeqb ymm4, ymm2, ymm0
# Merge both results: any set lane in either block survives the OR.
vpor ymm4, ymm4, ymm3
vpmovmskb ecx, ymm4
add rdx, 0x40
# Zero mask means neither block matched: continue with the next pair.
test ecx, ecx
je loop_nt_x2_prefetch
jmp end
# tiled access
# Scan variant: reads two positions per iteration - 64 bytes at rdx and
# 64 bytes at rdx + 0x10000 - and exits when either produces a match.
#   in:   rdx = read pointer, ymm0 = byte pattern to compare against
#   out:  rdx = read pointer after the final advance (see the tile step below)
#   uses: ymm1-ymm8, rcx, r9, r11, flags
# All four loads are vmovaps, whose 256-bit form requires a 32-byte-aligned
# address, so rdx must be 32-byte aligned on entry to this loop.
# The exit does not record which of the two positions matched, and rdx may
# already have been moved on to the next tile when `end` is reached.
# NOTE(review): the tile boundary test below looks at the low 16 bits of the
# absolute address in rdx, so the two positions only partition the buffer
# cleanly if the start address is 0x10000-aligned - confirm with the caller.
# The loop has no length check; it only terminates on a match.
.align 64
loop_tiled: # 392ms
# first position: the 64 bytes at rdx
vmovaps ymm1, ymmword ptr [rdx]
vmovaps ymm2, ymmword ptr [rdx + 0x20]
vpcmpeqb ymm5, ymm1, ymm0
vpcmpeqb ymm6, ymm2, ymm0
vpor ymm6, ymm6, ymm5
vpmovmskb ecx, ymm6
# second position: the 64 bytes at rdx + 0x10000 (also plain vmovaps loads,
# not non-temporal ones)
vmovaps ymm3, ymmword ptr [rdx + 0x10000]
vmovaps ymm4, ymmword ptr [rdx + 0x10020]
vpcmpeqb ymm7, ymm3, ymm0
vpcmpeqb ymm8, ymm4, ymm0
vpor ymm8, ymm8, ymm7
vpmovmskb r11, ymm8
add rdx, 0x40
# r9 = low 16 bits of the advanced pointer; zero means rdx has just reached a
# 0x10000 boundary.
mov r9, 0xffff
and r9, rdx
cmp r9, 0
jne _loop_nt_mix_epi
# At a boundary: step over the next 0x10000 bytes, which the second position
# has been reading while the first walked up to this boundary.
add rdx, 0x10000
_loop_nt_mix_epi:
# Combine the masks of both positions; zero means neither matched.
or rcx, r11
test ecx, ecx
je loop_tiled
jmp end
# Common exit for every scan loop: turn the final read pointer into a byte
# offset relative to the start address and return it.
#   in:  rdx = read pointer left by the scan loop
#        rax = start address (never modified by the scan loops)
#   out: rax = rdx - start address (rdx holds the same value on return)
end:
sub rdx, rax
mov rax, rdx
# The scan loops dirty the upper halves of the ymm registers. Clear them
# before returning so that legacy-SSE code in the caller does not pay the
# AVX/SSE transition penalty. vzeroupper touches neither rax nor the flags.
vzeroupper
ret