-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathASMVecConvolve.pas
187 lines (138 loc) · 4.39 KB
/
ASMVecConvolve.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
// ###################################################################
// #### This file is part of the mathematics library project, and is
// #### offered under the licence agreement described on
// #### http://www.mrsoft.org/
// ####
// #### Copyright:(c) 2018, Michael R. . All rights reserved.
// ####
// #### Unless required by applicable law or agreed to in writing, software
// #### distributed under the License is distributed on an "AS IS" BASIS,
// #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// #### See the License for the specific language governing permissions and
// #### limitations under the License.
// ###################################################################
unit ASMVecConvolve;
interface
{$I 'mrMath_CPU.inc'}
{$IFNDEF x64}
// Simple convolution: the input and output parameters are assumed to be plain
// vectors of doubles. For every i in 0..aLen - 1 the routine stores
//    dest[i] = A[i - bLen + 1]*B[0] + A[i - bLen + 2]*B[1] + ... + A[i]*B[bLen - 1]
// B is applied as stored - presumably the caller passes the kernel already
// reversed (hence "RevB").
// Requirements:
// -> memory before A needs to be accessible for at least bLen elements
//    (the elements directly in front of A[0] are used for the convolution calculation)
// -> B needs to be 16 byte aligned (it is loaded with movapd)
// -> bLen mod 2 needs to be zero (B is processed two doubles at a time)
procedure ASMVecConvolveRevB(dest : PDouble; A, B : PDouble; aLen, bLen : NativeInt); {$IFDEF FPC} assembler; {$ELSE} register; {$ENDIF}
// not yet ready (hence not exported)...
// does the same as above, but consecutive input values are taken dA elements apart
// memory starting at A - bLen*dA elements must be accessible
// procedure ASMVecConvolveRevBEx(dest : PDouble; A, B : PDouble; aLen, bLen : NativeInt; dA : NativeInt); {$IFDEF FPC} assembler; {$ELSE} register; {$ENDIF}
{$ENDIF}
implementation
{$IFNDEF x64}
// Convolves the vector A with the kernel B (B is used as stored, i.e. the
// caller is expected to hand in the already reversed kernel):
//    dest[i] = sum(k = 0..bLen - 1) A[i - (bLen - 1) + k]*B[k],   i = 0..aLen - 1
// Preconditions:
//  - the bLen - 1 doubles in front of A[0] must be readable
//  - B must be 16 byte aligned (movapd), A may be unaligned (movupd)
//  - bLen must be even (two doubles are processed per step)
//  - needs SSE3 (haddpd)
// aLen <= 0 returns without touching dest; bLen = 0 stores 0.0 in every element.
procedure ASMVecConvolveRevB(dest : PDouble; A, B : PDouble; aLen, bLen : NativeInt);
// eax = dest, edx = A, ecx = B; aLen and bLen are passed on the stack
asm
   push esi;
   push edi;

   // without this check "dec aLen / jnz" below would wrap around for an
   // empty output and run far beyond the buffers
   cmp aLen, 0;
   jle @@exitProc;

   // esi = -bLen*sizeof(double): (negative) byte offset of the first kernel element
   mov esi, bLen;
   imul esi, -8;

   // edx is kept one element behind the current sample so that
   // edx + esi addresses A[i - (bLen - 1)]
   add edx, 8;
   // ecx = B + bLen*sizeof(double) -> ecx + esi addresses B[0]
   sub ecx, esi;

   // one output element per iteration
   @@forxloop:
      mov edi, esi;        // edi counts from -bLen*8 up to 0
      xorpd xmm0, xmm0;    // accumulator (two partial sums)

      // unrolled part: 4 x 2 doubles (64 bytes) per pass
      @@innerLoopUnrolled:
         add edi, 64;
         jg @@innerLoopStart;    // fewer than 64 bytes left

         movupd xmm1, [edx + edi - 64];
         movapd xmm2, [ecx + edi - 64];
         mulpd xmm1, xmm2;
         addpd xmm0, xmm1;

         movupd xmm3, [edx + edi - 48];
         movapd xmm4, [ecx + edi - 48];
         mulpd xmm3, xmm4;
         addpd xmm0, xmm3;

         movupd xmm1, [edx + edi - 32];
         movapd xmm2, [ecx + edi - 32];
         mulpd xmm1, xmm2;
         addpd xmm0, xmm1;

         movupd xmm3, [edx + edi - 16];
         movapd xmm4, [ecx + edi - 16];
         mulpd xmm3, xmm4;
         addpd xmm0, xmm3;
      jmp @@innerLoopUnrolled;

      @@innerLoopStart:
      // undo the last add; zero means the whole kernel has been consumed
      sub edi, 64;
      jz @@innerLoopEnd;

      // remaining pairs of doubles
      @@innerLoop:
         movupd xmm1, [edx + edi];
         movapd xmm2, [ecx + edi];
         mulpd xmm1, xmm2;
         addpd xmm0, xmm1;
         add edi, 16;
      jnz @@innerLoop;

      @@innerLoopEnd:

      // add both partial sums and store the result
      haddpd xmm0, xmm0;
      movsd [eax], xmm0;

      // next element
      add eax, 8;
      add edx, 8;
   dec aLen;
   jnz @@forxloop;

   // ########################################
   // #### epilog
   @@exitProc:
   pop edi;
   pop esi;
end;
// Strided variant of ASMVecConvolveRevB (flagged as not yet ready and not
// exported in the interface section). Consecutive kernel elements are multiplied
// with input samples that are dA elements apart:
//    dest[i] = sum(k = 0..bLen - 1) A[i - (bLen - 1 - k)*dA]*B[k],   i = 0..aLen - 1
// Preconditions:
//  - the (bLen - 1)*dA doubles in front of A[0] must be readable
//  - B must be 16 byte aligned (movapd)
//  - bLen must be even (two kernel elements are processed per step)
//  - needs SSE3 (haddpd)
// aLen <= 0 returns without touching dest; bLen = 0 stores 0.0 in every element.
// NOTE(review): the input start pointer advances by one element (8 bytes) per
// output value - confirm that this is the intended step before exporting.
procedure ASMVecConvolveRevBEx(dest : PDouble; A, B : PDouble; aLen, bLen : NativeInt; dA : NativeInt); {$IFDEF FPC} assembler; {$ELSE} register; {$ENDIF}
var pA : PDouble;
// eax = dest, edx = A, ecx = B; aLen, bLen and dA are passed on the stack
asm
   push ebx;
   push esi;
   push edi;

   // without this check "dec aLen / jnz" below would wrap around for an
   // empty output and run far beyond the buffers
   cmp aLen, 0;
   jle @@exitProc;

   // esi = -bLen*sizeof(double): (negative) byte offset of the first kernel element
   mov esi, bLen;
   imul esi, -8;
   // ecx = B + bLen*sizeof(double) -> ecx + esi addresses B[0]
   sub ecx, esi;

   // pA = A - (bLen - 1)*dA elements: the sample that meets B[0]
   // (dec only changes the local stack copy of bLen; esi already holds the length)
   mov ebx, dA;
   dec bLen;
   imul ebx, bLen;
   shl ebx, 3;
   sub edx, ebx;
   mov pA, edx;

   // ebx = distance between two used input samples in bytes
   mov ebx, dA;
   shl ebx, 3;

   // convolution with stride: one output element per iteration
   @@forxloop:
      mov edi, esi;        // edi counts from -bLen*8 up to 0
      mov edx, pA;         // first input sample of this output element
      xorpd xmm0, xmm0;    // accumulator (two partial sums)

      // empty kernel: nothing to accumulate, store 0.0 like ASMVecConvolveRevB
      test edi, edi;
      jz @@innerLoopEnd;

      @@innerLoop:
         movsd xmm1, [edx];             // first element to lower part
         movhpd xmm1, [edx + ebx];      // next element to higher part
         movapd xmm2, [ecx + edi];      // load reversed kernel elements
         mulpd xmm1, xmm2;
         addpd xmm0, xmm1;

         add edi, 16;
         lea edx, [edx + 2*ebx];        // lea leaves the flags of the add intact
      jnz @@innerLoop;

      @@innerLoopEnd:

      // add both partial sums and store the result
      haddpd xmm0, xmm0;
      movsd [eax], xmm0;

      // next element
      add pA, 8;
      add eax, 8;
   dec aLen;
   jnz @@forxloop;

   // ########################################
   // #### epilog
   @@exitProc:
   pop edi;
   pop esi;
   pop ebx;
end;
{$ENDIF}
end.