-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathAVXVecConvolve.pas
133 lines (97 loc) · 5.37 KB
/
AVXVecConvolve.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// ###################################################################
// #### This file is part of the mathematics library project, and is
// #### offered under the licence agreement described on
// #### http://www.mrsoft.org/
// ####
// #### Copyright:(c) 2018, Michael R. . All rights reserved.
// ####
// #### Unless required by applicable law or agreed to in writing, software
// #### distributed under the License is distributed on an "AS IS" BASIS,
// #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// #### See the License for the specific language governing permissions and
// #### limitations under the License.
// ###################################################################
unit AVXVecConvolve;
interface
{$I 'mrMath_CPU.inc'}
{$IFNDEF x64}
// simple convolution: the input and output parameters are assumed to be vectors!
// it's also assumed that the memory before A is accessible for at least bLen elements
// -> these elements are used for the convolution calculation
// -> needs an aligned B and bLen mod 2 needs to be zero
procedure AVXVecConvolveRevB(dest : PDouble; A, B : PDouble; aLen, bLen : NativeInt); {$IFDEF FPC} assembler; {$ELSE} register; {$ENDIF}
{$ENDIF}
implementation
{$IFNDEF x64}
procedure AVXVecConvolveRevB(dest : PDouble; A, B : PDouble; aLen, bLen : NativeInt);
// AVX dot product of a sliding window of A with the vector B:
//    dest[i] = sum(k = 0..bLen-1) A[i - bLen + 1 + k]*B[k]   for i = 0..aLen-1
// (B is used in memory order, i.e. the caller passes the already reversed kernel).
//
// eax = dest, edx = A, ecx = B; aLen and bLen are accessed by name.
//
// Preconditions (not checked here):
//  - the bLen - 1 doubles in front of A are readable (the window ends at the current element)
//  - B is 32 byte aligned: it is read with vmovapd on ymm registers
//  - bLen is even: the tail handles exactly one remaining pair of doubles
//
// Register usage:
//  esi  = -bLen*8, byte offset of the window start relative to the window end
//  edi  = running byte offset, counts from esi up to 0
//  edx  = address one double behind the current A element (end of the window)
//  ecx  = address one double behind the last B element
//  ymm0 = four partial sums of the current output element
asm
   push esi;
   push edi;

   // nothing to do for aLen <= 0. Without this check the "dec aLen; jnz" loop
   // below would start at or below zero and run until the counter wraps around.
   mov edi, aLen;
   test edi, edi;
   jle @@endProc;

   mov esi, bLen;
   imul esi, -8;

   // let both pointers point to the end of their range so that one
   // negative offset (edi) can address A and B at the same time
   add edx, 8;
   sub ecx, esi;

   // one element convolution
   @@forxloop:
      mov edi, esi;

      {$IFDEF AVXSUP}vxorpd ymm0, ymm0, ymm0;                 {$ELSE}db $C5,$FD,$57,$C0;{$ENDIF}

      // unrolled part: 16 doubles (128 bytes) per iteration
      @@innerLoopUnrolled:
         add edi, 128;
         jg @@innerLoopStart;

         {$IFDEF AVXSUP}vmovupd ymm1, [edx + edi - 128];      {$ELSE}db $C5,$FD,$10,$4C,$3A,$80;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm2, [ecx + edi - 128];      {$ELSE}db $C5,$FD,$28,$54,$39,$80;{$ENDIF}
         {$IFDEF AVXSUP}vmulpd ymm1, ymm1, ymm2;              {$ELSE}db $C5,$F5,$59,$CA;{$ENDIF}
         {$IFDEF AVXSUP}vaddpd ymm0, ymm0, ymm1;              {$ELSE}db $C5,$FD,$58,$C1;{$ENDIF}

         {$IFDEF AVXSUP}vmovupd ymm3, [edx + edi - 96];       {$ELSE}db $C5,$FD,$10,$5C,$3A,$A0;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm4, [ecx + edi - 96];       {$ELSE}db $C5,$FD,$28,$64,$39,$A0;{$ENDIF}
         {$IFDEF AVXSUP}vmulpd ymm3, ymm3, ymm4;              {$ELSE}db $C5,$E5,$59,$DC;{$ENDIF}
         {$IFDEF AVXSUP}vaddpd ymm0, ymm0, ymm3;              {$ELSE}db $C5,$FD,$58,$C3;{$ENDIF}

         {$IFDEF AVXSUP}vmovupd ymm1, [edx + edi - 64];       {$ELSE}db $C5,$FD,$10,$4C,$3A,$C0;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm2, [ecx + edi - 64];       {$ELSE}db $C5,$FD,$28,$54,$39,$C0;{$ENDIF}
         {$IFDEF AVXSUP}vmulpd ymm1, ymm1, ymm2;              {$ELSE}db $C5,$F5,$59,$CA;{$ENDIF}
         {$IFDEF AVXSUP}vaddpd ymm0, ymm0, ymm1;              {$ELSE}db $C5,$FD,$58,$C1;{$ENDIF}

         {$IFDEF AVXSUP}vmovupd ymm3, [edx + edi - 32];       {$ELSE}db $C5,$FD,$10,$5C,$3A,$E0;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm4, [ecx + edi - 32];       {$ELSE}db $C5,$FD,$28,$64,$39,$E0;{$ENDIF}
         {$IFDEF AVXSUP}vmulpd ymm3, ymm3, ymm4;              {$ELSE}db $C5,$E5,$59,$DC;{$ENDIF}
         {$IFDEF AVXSUP}vaddpd ymm0, ymm0, ymm3;              {$ELSE}db $C5,$FD,$58,$C3;{$ENDIF}
      jmp @@innerLoopUnrolled;

      @@innerLoopStart:
      // undo the last (overshooting) increment; zero means the window is done
      sub edi, 128;
      jz @@innerLoopEnd;

      // remaining blocks of 4 doubles (32 bytes)
      @@innerLoop:
         add edi, 32;
         jg @@innerLoop2Start;

         {$IFDEF AVXSUP}vmovupd ymm1, [edx + edi - 32];       {$ELSE}db $C5,$FD,$10,$4C,$3A,$E0;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm2, [ecx + edi - 32];       {$ELSE}db $C5,$FD,$28,$54,$39,$E0;{$ENDIF}
         {$IFDEF AVXSUP}vmulpd ymm1, ymm1, ymm2;              {$ELSE}db $C5,$F5,$59,$CA;{$ENDIF}
         {$IFDEF AVXSUP}vaddpd ymm0, ymm0, ymm1;              {$ELSE}db $C5,$FD,$58,$C1;{$ENDIF}
      jmp @@innerLoop;

      @@innerLoop2Start:
      sub edi, 32;
      jz @@innerLoopEnd;

      // last two elements: for an even bLen edi is -16 here, so the pair sits
      // directly in front of the end pointers and has to be addressed via edi
      {$IFDEF AVXSUP}vmovupd xmm3, [edx + edi];               {$ELSE}db $C5,$F9,$10,$1C,$3A;{$ENDIF}
      {$IFDEF AVXSUP}vmovapd xmm4, [ecx + edi];               {$ELSE}db $C5,$F9,$28,$24,$39;{$ENDIF}
      {$IFDEF AVXSUP}vmulpd xmm3, xmm3, xmm4;                 {$ELSE}db $C5,$E1,$59,$DC;{$ENDIF}
      // 256 bit add on purpose: a VEX encoded 128 bit add would clear the upper
      // half of the accumulator. The upper half of ymm3 is zero after the
      // 128 bit multiplication so only the lower two sums change.
      {$IFDEF AVXSUP}vaddpd ymm0, ymm0, ymm3;                 {$ELSE}db $C5,$FD,$58,$C3;{$ENDIF}

      @@innerLoopEnd:

      // horizontal sum of the four partial sums -> dest^
      {$IFDEF AVXSUP}vextractf128 xmm1, ymm0, 1;              {$ELSE}db $C4,$E3,$7D,$19,$C1,$01;{$ENDIF}
      {$IFDEF AVXSUP}vaddpd xmm0, xmm0, xmm1;                 {$ELSE}db $C5,$F9,$58,$C1;{$ENDIF}
      {$IFDEF AVXSUP}vhaddpd xmm0, xmm0, xmm0;                {$ELSE}db $C5,$F9,$7C,$C0;{$ENDIF}
      {$IFDEF AVXSUP}vmovsd [eax], xmm0;                      {$ELSE}db $C5,$FB,$11,$00;{$ENDIF}

      // next element
      add eax, 8;
      add edx, 8;
   dec aLen;
   jnz @@forxloop;

   // ########################################
   // #### epilog
   // clear the upper ymm halves before returning to code that may use SSE
   {$IFDEF AVXSUP}vzeroupper;                                 {$ELSE}db $C5,$F8,$77;{$ENDIF}

   @@endProc:
   pop edi;
   pop esi;
end;
{$ENDIF}
end.