-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathFMAVecConvolve.pas
129 lines (93 loc) · 4.97 KB
/
FMAVecConvolve.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
// ###################################################################
// #### This file is part of the mathematics library project, and is
// #### offered under the licence agreement described on
// #### http://www.mrsoft.org/
// ####
// #### Copyright:(c) 2018, Michael R. . All rights reserved.
// ####
// #### Unless required by applicable law or agreed to in writing, software
// #### distributed under the License is distributed on an "AS IS" BASIS,
// #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// #### See the License for the specific language governing permissions and
// #### limitations under the License.
// ###################################################################
unit FMAVecConvolve;
interface
{$I 'mrMath_CPU.inc'}
{$IFNDEF x64}
// Simple convolution (32 bit AVX/FMA assembler version): the input and output
// parameters are assumed to be vectors!
//   dest : receives aLen result values
//   A    : input vector; the elements stored in memory directly before A are
//          read as well (the window for one output value spans bLen elements
//          and ends at the current element of A), so that memory needs to be
//          accessible
//   B    : kernel of bLen elements; it is multiplied against the window of A
//          in ascending memory order (presumably the caller passes the kernel
//          already reversed, hence "RevB" - verify against the caller)
// Requirements:
//   -> B is loaded with aligned moves (vmovapd) and therefore needs to be aligned
//   -> bLen mod 2 needs to be zero (the tail handles exactly one pair of elements)
procedure FMAVecConvolveRevB(dest : PDouble; A, B : PDouble; aLen, bLen : NativeInt); {$IFDEF FPC} assembler; {$ELSE} register; {$ENDIF}
{$ENDIF}
implementation
{$IFNDEF x64}
procedure FMAVecConvolveRevB(dest : PDouble; A, B : PDouble; aLen, bLen : NativeInt);
// Vector convolution with AVX/FMA instructions:
//    dest[i] = sum(j = 0..bLen-1) A[i - bLen + 1 + j]*B[j]     for i = 0..aLen-1
//
// 32 bit register calling convention:
//    eax = dest, edx = A, ecx = B; aLen and bLen are passed on the stack
//
// Requirements:
//    - the bLen - 1 elements stored in front of A must be readable
//    - B is read with vmovapd -> B needs to be 32 byte aligned
//    - bLen needs to be even (the tail handles exactly one pair of doubles)
//
// Register usage:
//    esi  = -bLen*sizeof(double), start value of the inner byte offset
//    edi  = running (negative) byte offset of the inner loops
//    edx  = points one element past the current element of A
//    ecx  = points one element past the last element of B
//    ymm0 = accumulator for one output element
asm
   push esi;
   push edi;

   // nothing to do for an empty output vector. Without this check the
   // "dec aLen" at the end of the outer loop would never reach zero
   cmp aLen, 0;
   jle @@endProc;

   // esi = -bLen*sizeof(double)
   mov esi, bLen;
   imul esi, -8;

   // move both pointers one past the end of the window so the inner loops
   // can count a negative offset up towards zero
   add edx, 8;
   sub ecx, esi;

   // one element convolution
   @@forxloop:
      mov edi, esi;

      // clear the accumulator
      {$IFDEF AVXSUP}vxorpd ymm0, ymm0, ymm0;                    {$ELSE}db $C5,$FD,$57,$C0;{$ENDIF}

      // unrolled part: 16 doubles (128 bytes) per iteration
      @@innerLoopUnrolled:
         add edi, 128;
         jg @@innerLoopStart;

         {$IFDEF AVXSUP}vmovupd ymm1, [edx + edi - 128];         {$ELSE}db $C5,$FD,$10,$4C,$3A,$80;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm2, [ecx + edi - 128];         {$ELSE}db $C5,$FD,$28,$54,$39,$80;{$ENDIF}
         {$IFDEF AVXSUP}vfmadd231pd ymm0, ymm1, ymm2;            {$ELSE}db $C4,$E2,$F5,$B8,$C2;{$ENDIF}

         {$IFDEF AVXSUP}vmovupd ymm3, [edx + edi - 96];          {$ELSE}db $C5,$FD,$10,$5C,$3A,$A0;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm4, [ecx + edi - 96];          {$ELSE}db $C5,$FD,$28,$64,$39,$A0;{$ENDIF}
         {$IFDEF AVXSUP}vfmadd231pd ymm0, ymm3, ymm4;            {$ELSE}db $C4,$E2,$E5,$B8,$C4;{$ENDIF}

         {$IFDEF AVXSUP}vmovupd ymm1, [edx + edi - 64];          {$ELSE}db $C5,$FD,$10,$4C,$3A,$C0;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm2, [ecx + edi - 64];          {$ELSE}db $C5,$FD,$28,$54,$39,$C0;{$ENDIF}
         {$IFDEF AVXSUP}vfmadd231pd ymm0, ymm1, ymm2;            {$ELSE}db $C4,$E2,$F5,$B8,$C2;{$ENDIF}

         {$IFDEF AVXSUP}vmovupd ymm3, [edx + edi - 32];          {$ELSE}db $C5,$FD,$10,$5C,$3A,$E0;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm4, [ecx + edi - 32];          {$ELSE}db $C5,$FD,$28,$64,$39,$E0;{$ENDIF}
         {$IFDEF AVXSUP}vfmadd231pd ymm0, ymm3, ymm4;            {$ELSE}db $C4,$E2,$E5,$B8,$C4;{$ENDIF}
      jmp @@innerLoopUnrolled;

      @@innerLoopStart:
      // undo the last (overshooting) step; zero means the window is done
      sub edi, 128;
      jz @@innerLoopEnd;

      // 4 doubles (32 bytes) per iteration
      @@innerLoop:
         add edi, 32;
         jg @@innerLoop2Start;

         {$IFDEF AVXSUP}vmovupd ymm1, [edx + edi - 32];          {$ELSE}db $C5,$FD,$10,$4C,$3A,$E0;{$ENDIF}
         {$IFDEF AVXSUP}vmovapd ymm2, [ecx + edi - 32];          {$ELSE}db $C5,$FD,$28,$54,$39,$E0;{$ENDIF}
         {$IFDEF AVXSUP}vfmadd231pd ymm0, ymm1, ymm2;            {$ELSE}db $C4,$E2,$F5,$B8,$C2;{$ENDIF}
      jmp @@innerLoop;

      @@innerLoop2Start:
      sub edi, 32;
      jz @@innerLoopEnd;

      // last two elements: edi = -16 here, so the remaining pair is located
      // directly in front of the two end pointers
      {$IFDEF AVXSUP}vmovupd xmm3, [edx - 16];                   {$ELSE}db $C5,$F9,$10,$5A,$F0;{$ENDIF}
      {$IFDEF AVXSUP}vmovapd xmm4, [ecx - 16];                   {$ELSE}db $C5,$F9,$28,$61,$F0;{$ENDIF}
      {$IFDEF AVXSUP}vmulpd xmm3, xmm3, xmm4;                    {$ELSE}db $C5,$E1,$59,$DC;{$ENDIF}
      // 256 bit add on purpose: a VEX.128 add would clear the upper lanes of
      // the accumulator. The upper half of ymm3 is zero after the 128 bit vmulpd
      {$IFDEF AVXSUP}vaddpd ymm0, ymm0, ymm3;                    {$ELSE}db $C5,$FD,$58,$C3;{$ENDIF}

      @@innerLoopEnd:

      // horizontal sum of the four accumulator lanes
      {$IFDEF AVXSUP}vextractf128 xmm1, ymm0, 1;                 {$ELSE}db $C4,$E3,$7D,$19,$C1,$01;{$ENDIF}
      {$IFDEF AVXSUP}vaddpd xmm0, xmm0, xmm1;                    {$ELSE}db $C5,$F9,$58,$C1;{$ENDIF}
      {$IFDEF AVXSUP}vhaddpd xmm0, xmm0, xmm0;                   {$ELSE}db $C5,$F9,$7C,$C0;{$ENDIF}
      {$IFDEF AVXSUP}vmovsd [eax], xmm0;                         {$ELSE}db $C5,$FB,$11,$00;{$ENDIF}

      // next element
      add eax, 8;
      add edx, 8;
   dec aLen;
   jnz @@forxloop;

   // ########################################
   // #### epilog
   @@endProc:
   {$IFDEF AVXSUP}vzeroupper;                                    {$ELSE}db $C5,$F8,$77;{$ENDIF}
   pop edi;
   pop esi;
end;
{$ENDIF}
end.