-
Notifications
You must be signed in to change notification settings - Fork 38
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
arm64: crypto: add NEON accelerated XOR implementation
This is a NEON acceleration method that can improve performance by approximately 20%. I got the following data from the centos 7.5 on Huawei's HISI1616 chip: [ 93.837726] xor: measuring software checksum speed [ 93.874039] 8regs : 7123.200 MB/sec [ 93.914038] 32regs : 7180.300 MB/sec [ 93.954043] arm64_neon: 9856.000 MB/sec [ 93.954047] xor: using function: arm64_neon (9856.000 MB/sec) I believe this code can bring some optimization for all arm64 platforms. Thanks to Ard Biesheuvel for his suggestions. Signed-off-by: Jackie Liu <[email protected]> Reviewed-by: Ard Biesheuvel <[email protected]> Signed-off-by: Will Deacon <[email protected]> (cherry picked from commit 440ac74221bf9848d61b4ec3cd2ebcdac484aa3e) (cherry picked from commit 1be90ad0ac37b65cc7fa42974e07b34d5ed085ac) (cherry picked from commit a0ec831645f5d3a87b1a2ecadde1cdf5a2120ece) (cherry picked from commit 7317312505e6e74efe2acaa36fb971a8413fc07f) (cherry picked from commit 3f93e27666a3666dd16d1f7291c644aae7deb45b) (cherry picked from commit 0e2b445afd97e0388e6ea3a0e0df6b08a36c8bb1) (cherry picked from commit 79a55a62150afd2f69ecde4d36ccea7ccb4eec71) (cherry picked from commit 983072822abb9f8e786eeed01f206958971f2a16)
- Loading branch information
1 parent
69829a7
commit df5b8a2
Showing
4 changed files
with
263 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,4 +46,3 @@ generic-y += types.h | |
generic-y += unaligned.h | ||
generic-y += user.h | ||
generic-y += vga.h | ||
generic-y += xor.h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
/* | ||
* arch/arm64/include/asm/xor.h | ||
* | ||
* Authors: Jackie Liu <[email protected]> | ||
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License version 2 as | ||
* published by the Free Software Foundation. | ||
*/ | ||
|
||
#include <linux/hardirq.h> | ||
#include <asm-generic/xor.h> | ||
#include <asm/hwcap.h> | ||
#include <asm/neon.h> | ||
|
||
#ifdef CONFIG_KERNEL_MODE_NEON | ||
|
||
extern struct xor_block_template const xor_block_inner_neon; | ||
|
||
static void | ||
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
{ | ||
kernel_neon_begin(); | ||
xor_block_inner_neon.do_2(bytes, p1, p2); | ||
kernel_neon_end(); | ||
} | ||
|
||
static void | ||
xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
unsigned long *p3) | ||
{ | ||
kernel_neon_begin(); | ||
xor_block_inner_neon.do_3(bytes, p1, p2, p3); | ||
kernel_neon_end(); | ||
} | ||
|
||
static void | ||
xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
unsigned long *p3, unsigned long *p4) | ||
{ | ||
kernel_neon_begin(); | ||
xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4); | ||
kernel_neon_end(); | ||
} | ||
|
||
static void | ||
xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
{ | ||
kernel_neon_begin(); | ||
xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5); | ||
kernel_neon_end(); | ||
} | ||
|
||
static struct xor_block_template xor_block_arm64 = { | ||
.name = "arm64_neon", | ||
.do_2 = xor_neon_2, | ||
.do_3 = xor_neon_3, | ||
.do_4 = xor_neon_4, | ||
.do_5 = xor_neon_5 | ||
}; | ||
/* Candidate implementations benchmarked at boot; the NEON template is
 * only tried when the CPU advertises NEON/ASIMD support. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
	do {						\
		xor_speed(&xor_block_8regs);		\
		xor_speed(&xor_block_32regs);		\
		if (cpu_has_neon())			\
			xor_speed(&xor_block_arm64);	\
	} while (0)
|
||
#endif /* ! CONFIG_KERNEL_MODE_NEON */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
/* | ||
* arch/arm64/lib/xor-neon.c | ||
* | ||
* Authors: Jackie Liu <[email protected]> | ||
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License version 2 as | ||
* published by the Free Software Foundation. | ||
*/ | ||
|
||
#include <linux/raid/xor.h> | ||
#include <linux/module.h> | ||
#include <asm/neon-intrinsics.h> | ||
|
||
void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1, | ||
unsigned long *p2) | ||
{ | ||
uint64_t *dp1 = (uint64_t *)p1; | ||
uint64_t *dp2 = (uint64_t *)p2; | ||
|
||
register uint64x2_t v0, v1, v2, v3; | ||
long lines = bytes / (sizeof(uint64x2_t) * 4); | ||
|
||
do { | ||
/* p1 ^= p2 */ | ||
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); | ||
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); | ||
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); | ||
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); | ||
|
||
/* store */ | ||
vst1q_u64(dp1 + 0, v0); | ||
vst1q_u64(dp1 + 2, v1); | ||
vst1q_u64(dp1 + 4, v2); | ||
vst1q_u64(dp1 + 6, v3); | ||
|
||
dp1 += 8; | ||
dp2 += 8; | ||
} while (--lines > 0); | ||
} | ||
|
||
void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1, | ||
unsigned long *p2, unsigned long *p3) | ||
{ | ||
uint64_t *dp1 = (uint64_t *)p1; | ||
uint64_t *dp2 = (uint64_t *)p2; | ||
uint64_t *dp3 = (uint64_t *)p3; | ||
|
||
register uint64x2_t v0, v1, v2, v3; | ||
long lines = bytes / (sizeof(uint64x2_t) * 4); | ||
|
||
do { | ||
/* p1 ^= p2 */ | ||
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); | ||
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); | ||
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); | ||
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); | ||
|
||
/* p1 ^= p3 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); | ||
|
||
/* store */ | ||
vst1q_u64(dp1 + 0, v0); | ||
vst1q_u64(dp1 + 2, v1); | ||
vst1q_u64(dp1 + 4, v2); | ||
vst1q_u64(dp1 + 6, v3); | ||
|
||
dp1 += 8; | ||
dp2 += 8; | ||
dp3 += 8; | ||
} while (--lines > 0); | ||
} | ||
|
||
void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1, | ||
unsigned long *p2, unsigned long *p3, unsigned long *p4) | ||
{ | ||
uint64_t *dp1 = (uint64_t *)p1; | ||
uint64_t *dp2 = (uint64_t *)p2; | ||
uint64_t *dp3 = (uint64_t *)p3; | ||
uint64_t *dp4 = (uint64_t *)p4; | ||
|
||
register uint64x2_t v0, v1, v2, v3; | ||
long lines = bytes / (sizeof(uint64x2_t) * 4); | ||
|
||
do { | ||
/* p1 ^= p2 */ | ||
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); | ||
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); | ||
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); | ||
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); | ||
|
||
/* p1 ^= p3 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); | ||
|
||
/* p1 ^= p4 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); | ||
|
||
/* store */ | ||
vst1q_u64(dp1 + 0, v0); | ||
vst1q_u64(dp1 + 2, v1); | ||
vst1q_u64(dp1 + 4, v2); | ||
vst1q_u64(dp1 + 6, v3); | ||
|
||
dp1 += 8; | ||
dp2 += 8; | ||
dp3 += 8; | ||
dp4 += 8; | ||
} while (--lines > 0); | ||
} | ||
|
||
void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, | ||
unsigned long *p2, unsigned long *p3, | ||
unsigned long *p4, unsigned long *p5) | ||
{ | ||
uint64_t *dp1 = (uint64_t *)p1; | ||
uint64_t *dp2 = (uint64_t *)p2; | ||
uint64_t *dp3 = (uint64_t *)p3; | ||
uint64_t *dp4 = (uint64_t *)p4; | ||
uint64_t *dp5 = (uint64_t *)p5; | ||
|
||
register uint64x2_t v0, v1, v2, v3; | ||
long lines = bytes / (sizeof(uint64x2_t) * 4); | ||
|
||
do { | ||
/* p1 ^= p2 */ | ||
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); | ||
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); | ||
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); | ||
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); | ||
|
||
/* p1 ^= p3 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); | ||
|
||
/* p1 ^= p4 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); | ||
|
||
/* p1 ^= p5 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp5 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp5 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp5 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp5 + 6)); | ||
|
||
/* store */ | ||
vst1q_u64(dp1 + 0, v0); | ||
vst1q_u64(dp1 + 2, v1); | ||
vst1q_u64(dp1 + 4, v2); | ||
vst1q_u64(dp1 + 6, v3); | ||
|
||
dp1 += 8; | ||
dp2 += 8; | ||
dp3 += 8; | ||
dp4 += 8; | ||
dp5 += 8; | ||
} while (--lines > 0); | ||
} | ||
|
||
struct xor_block_template const xor_block_inner_neon = { | ||
.name = "__inner_neon__", | ||
.do_2 = xor_arm64_neon_2, | ||
.do_3 = xor_arm64_neon_3, | ||
.do_4 = xor_arm64_neon_4, | ||
.do_5 = xor_arm64_neon_5, | ||
}; | ||
EXPORT_SYMBOL(xor_block_inner_neon); | ||
|
||
MODULE_AUTHOR("Jackie Liu <[email protected]>"); | ||
MODULE_DESCRIPTION("ARMv8 XOR Extensions"); | ||
MODULE_LICENSE("GPL"); |