-
Notifications
You must be signed in to change notification settings - Fork 251
/
galois_arm64.s
115 lines (94 loc) · 2.61 KB
/
galois_arm64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
//+build !noasm
//+build !appengine
//+build !gccgo
//+build !nopshufb
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.
#include "textflag.h"
// LOAD(LO1, LO2, HI1, HI2) reads the next 32 bytes from the address held in
// R1 into LO1/LO2 and post-increments R1 by 32. Every byte is then split into
// its two nibbles: HI1/HI2 receive the bytes shifted right by 4, and LO1/LO2
// are masked in place with V8.
// V8 is not set up here; both functions in this file fill every byte lane of
// V8 with 0x0f before the first use of this macro.
// The shifts are issued before the masks because the masks overwrite LO1/LO2.
#define LOAD(LO1, LO2, HI1, HI2) \
VLD1.P 32(R1), [LO1.B16, LO2.B16] \
\
\ // High nibbles go to HI1/HI2, low nibbles stay in LO1/LO2
VUSHR $4, LO1.B16, HI1.B16 \
VUSHR $4, LO2.B16, HI2.B16 \
VAND V8.B16, LO1.B16, LO1.B16 \
VAND V8.B16, LO2.B16, LO2.B16
// GALOIS_MUL(MUL_LO, MUL_HI, OUT1, OUT2, TMP1, TMP2) performs four 16-byte
// table lookups and XORs them pairwise.
// The lookup indices are fixed registers, not macro parameters: V0 and V1
// index into MUL_LO, V10 and V11 index into MUL_HI. Both call sites in this
// file fill those registers with LOAD(V0, V1, V10, V11) immediately before.
// Results:
//   OUT1 = MUL_LO[V0] ^ MUL_HI[V10]
//   OUT2 = MUL_LO[V1] ^ MUL_HI[V11]
// OUT2 temporarily holds MUL_HI[V10] before it receives its final value;
// TMP1 and TMP2 are clobbered.
#define GALOIS_MUL(MUL_LO, MUL_HI, OUT1, OUT2, TMP1, TMP2) \
\ // Table lookups: low-nibble and high-nibble products for both vectors
VTBL V0.B16, [MUL_LO.B16], OUT1.B16 \
VTBL V10.B16, [MUL_HI.B16], OUT2.B16 \
VTBL V1.B16, [MUL_LO.B16], TMP1.B16 \
VTBL V11.B16, [MUL_HI.B16], TMP2.B16 \
\
\ // XOR the low and high products together
VEOR OUT2.B16, OUT1.B16, OUT1.B16 \
VEOR TMP2.B16, TMP1.B16, OUT2.B16
// func galMulNEON(low, high, in, out []byte)
//
// For every whole 32-byte block of in, splits each byte into nibbles, looks
// the nibbles up in the 16-byte tables low and high, XORs the two lookups and
// stores the 32 result bytes to out.
//
// Only in_len is consulted: a tail of len(in) % 32 bytes is left untouched,
// len(out) is never read, and 16 bytes are read from each of low and high
// without a length check.
//
// Registers:
//   R1  read cursor into in (advanced by LOAD)
//   R2  bytes remaining minus 32; negative once no full block is left
//   R3  scratch for building the nibble mask
//   R5  write cursor into out
//   R10 &low[0], R11 &high[0]
//   V6  low table, V7 high table, V8 0x0f in every byte lane
//   V0, V1, V10, V11 nibble vectors; V4, V5 results; V14, V15 scratch
//
// The TEXT flags evaluate to 7, the literal this routine has always used.
TEXT ·galMulNEON(SB), NOPROF|DUPOK|NOSPLIT, $0
	MOVD out_base+72(FP), R5
	MOVD in_base+48(FP), R1
	MOVD in_len+56(FP), R2

	// Nothing to do unless at least one full 32-byte block is available.
	SUBS $32, R2
	BMI  mulDone

	// Fetch the two 16-byte lookup tables.
	MOVD high_base+24(FP), R11
	MOVD low_base+0(FP), R10
	VLD1 (R11), [V7.B16]
	VLD1 (R10), [V6.B16]

	// Broadcast 0x0f to all 16 lanes of V8. This goes through lane 0 because
	// the assembler rejects `VDUP R3, V8.B16`
	// (dup v8.16b, w3 == WORD $0x4e010c68).
	MOVD $0x0f, R3
	VMOV R3, V8.B[0]
	VDUP V8.B[0], V8.B16

mulLoop:
	// One 32-byte block per iteration.
	LOAD(V0, V1, V10, V11)
	GALOIS_MUL(V6, V7, V4, V5, V14, V15)

	// Write the block and advance the output cursor.
	VST1.P [V4.D2, V5.D2], 32(R5)

	// Continue while another full block remains.
	SUBS $32, R2
	BPL  mulLoop

mulDone:
	RET
// func galMulXorNEON(low, high, in, out []byte)
//
// Same table-lookup product as galMulNEON, but the result is XORed into the
// bytes already present in out instead of overwriting them.
//
// Only in_len is consulted: a tail of len(in) % 32 bytes is left untouched,
// len(out) is never read, and 16 bytes are read from each of low and high
// without a length check.
//
// Registers:
//   R1  read cursor into in (advanced by LOAD)
//   R2  bytes remaining minus 32; negative once no full block is left
//   R3  scratch for building the nibble mask
//   R5  read/write cursor into out
//   R10 &low[0], R11 &high[0]
//   V6  low table, V7 high table, V8 0x0f in every byte lane
//   V0, V1, V10, V11 nibble vectors; V4, V5 results; V14, V15 scratch
//   V20, V21 current contents of the out block
//
// The TEXT flags evaluate to 7, the literal this routine has always used.
TEXT ·galMulXorNEON(SB), NOPROF|DUPOK|NOSPLIT, $0
	MOVD out_base+72(FP), R5
	MOVD in_base+48(FP), R1
	MOVD in_len+56(FP), R2

	// Nothing to do unless at least one full 32-byte block is available.
	SUBS $32, R2
	BMI  xorDone

	// Fetch the two 16-byte lookup tables.
	MOVD high_base+24(FP), R11
	MOVD low_base+0(FP), R10
	VLD1 (R11), [V7.B16]
	VLD1 (R10), [V6.B16]

	// Broadcast 0x0f to all 16 lanes of V8. This goes through lane 0 because
	// the assembler rejects `VDUP R3, V8.B16`
	// (dup v8.16b, w3 == WORD $0x4e010c68).
	MOVD $0x0f, R3
	VMOV R3, V8.B[0]
	VDUP V8.B[0], V8.B16

xorLoop:
	// One 32-byte block per iteration: fetch the input nibbles and the
	// current output block (R5 is not advanced by this plain load).
	LOAD(V0, V1, V10, V11)
	VLD1 (R5), [V20.B16, V21.B16]

	GALOIS_MUL(V6, V7, V4, V5, V14, V15)

	// Fold the product into the existing output bytes.
	VEOR V4.B16, V20.B16, V4.B16
	VEOR V5.B16, V21.B16, V5.B16

	// Write the block back and advance the output cursor.
	VST1.P [V4.D2, V5.D2], 32(R5)

	// Continue while another full block remains.
	SUBS $32, R2
	BPL  xorLoop

xorDone:
	RET
// getVectorLength stores two 8-byte results, vl at 0(FP) and pl at 8(FP).
// vl is what ADDVL #1 adds to a zeroed register, multiplied by 8.
// pl is what ADDPL #1 adds to a zeroed register, multiplied by 8.
// Presumably declared in Go as `func getVectorLength() (vl, pl uint64)` —
// confirm against the Go stub.
//
// ADDVL and ADDPL are SVE instructions, emitted as raw words here.
// NOTE(review): nothing in this routine checks for SVE support; presumably
// the caller only invokes it on CPUs that have SVE — verify at the call site.
TEXT ·getVectorLength(SB), NOSPLIT, $0
	// Both accumulators start at zero so the increments below yield the
	// lengths themselves.
	MOVD $0, R2
	MOVD $0, R3

	WORD $0x04225022 // addvl x2, x2, #1
	WORD $0x04635023 // addpl x3, x3, #1

	// Scale both values by 8.
	LSL $3, R2, R2
	LSL $3, R3, R3

	MOVD R2, vl+0(FP)
	MOVD R3, pl+8(FP)
	RET