Skip to content

Commit

Permalink
arm64: crypto: add NEON accelerated XOR implementation
Browse files Browse the repository at this point in the history
This is a NEON acceleration method that can improve
performance by approximately 20%. I got the following
data from the centos 7.5 on Huawei's HISI1616 chip:

[ 93.837726] xor: measuring software checksum speed
[ 93.874039]   8regs  : 7123.200 MB/sec
[ 93.914038]   32regs : 7180.300 MB/sec
[ 93.954043]   arm64_neon: 9856.000 MB/sec
[ 93.954047] xor: using function: arm64_neon (9856.000 MB/sec)

I believe this code can bring some optimization for
all arm64 platform. thanks for Ard Biesheuvel's suggestions.

Signed-off-by: Jackie Liu <[email protected]>
Reviewed-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Will Deacon <[email protected]>
(cherry picked from commit 440ac74221bf9848d61b4ec3cd2ebcdac484aa3e)
(cherry picked from commit 1be90ad0ac37b65cc7fa42974e07b34d5ed085ac)
(cherry picked from commit a0ec831645f5d3a87b1a2ecadde1cdf5a2120ece)
(cherry picked from commit 7317312505e6e74efe2acaa36fb971a8413fc07f)
(cherry picked from commit 3f93e27666a3666dd16d1f7291c644aae7deb45b)
(cherry picked from commit 0e2b445afd97e0388e6ea3a0e0df6b08a36c8bb1)
(cherry picked from commit 79a55a62150afd2f69ecde4d36ccea7ccb4eec71)
(cherry picked from commit 983072822abb9f8e786eeed01f206958971f2a16)
  • Loading branch information
JackieLiu1 authored and PainKiller3 committed Jul 7, 2022
1 parent 69829a7 commit df5b8a2
Show file tree
Hide file tree
Showing 4 changed files with 263 additions and 1 deletion.
1 change: 0 additions & 1 deletion arch/arm64/include/asm/Kbuild
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,3 @@ generic-y += types.h
generic-y += unaligned.h
generic-y += user.h
generic-y += vga.h
generic-y += xor.h
73 changes: 73 additions & 0 deletions arch/arm64/include/asm/xor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* arch/arm64/include/asm/xor.h
*
* Authors: Jackie Liu <[email protected]>
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/

#include <linux/hardirq.h>
#include <asm-generic/xor.h>
#include <asm/hwcap.h>
#include <asm/neon.h>

#ifdef CONFIG_KERNEL_MODE_NEON

extern struct xor_block_template const xor_block_inner_neon;

static void
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
kernel_neon_begin();
xor_block_inner_neon.do_2(bytes, p1, p2);
kernel_neon_end();
}

static void
xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
kernel_neon_begin();
xor_block_inner_neon.do_3(bytes, p1, p2, p3);
kernel_neon_end();
}

static void
xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
kernel_neon_begin();
xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
kernel_neon_end();
}

static void
xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
kernel_neon_begin();
xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
kernel_neon_end();
}

static struct xor_block_template xor_block_arm64 = {
.name = "arm64_neon",
.do_2 = xor_neon_2,
.do_3 = xor_neon_3,
.do_4 = xor_neon_4,
.do_5 = xor_neon_5
};
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
if (cpu_has_neon()) { \
xor_speed(&xor_block_arm64);\
} \
} while (0)

#endif /* ! CONFIG_KERNEL_MODE_NEON */
6 changes: 6 additions & 0 deletions arch/arm64/lib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
strchr.o strrchr.o

ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
CFLAGS_xor-neon.o += -ffreestanding
endif

# Tell the compiler to treat all general purpose registers (with the
# exception of the IP registers, which are already handled by the caller
# in case of a PLT) as callee-saved, which allows for efficient runtime
Expand Down
184 changes: 184 additions & 0 deletions arch/arm64/lib/xor-neon.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/*
* arch/arm64/lib/xor-neon.c
*
* Authors: Jackie Liu <[email protected]>
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
unsigned long *p2)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;

register uint64x2_t v0, v1, v2, v3;
long lines = bytes / (sizeof(uint64x2_t) * 4);

do {
/* p1 ^= p2 */
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

/* store */
vst1q_u64(dp1 + 0, v0);
vst1q_u64(dp1 + 2, v1);
vst1q_u64(dp1 + 4, v2);
vst1q_u64(dp1 + 6, v3);

dp1 += 8;
dp2 += 8;
} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
uint64_t *dp3 = (uint64_t *)p3;

register uint64x2_t v0, v1, v2, v3;
long lines = bytes / (sizeof(uint64x2_t) * 4);

do {
/* p1 ^= p2 */
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

/* p1 ^= p3 */
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

/* store */
vst1q_u64(dp1 + 0, v0);
vst1q_u64(dp1 + 2, v1);
vst1q_u64(dp1 + 4, v2);
vst1q_u64(dp1 + 6, v3);

dp1 += 8;
dp2 += 8;
dp3 += 8;
} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
uint64_t *dp3 = (uint64_t *)p3;
uint64_t *dp4 = (uint64_t *)p4;

register uint64x2_t v0, v1, v2, v3;
long lines = bytes / (sizeof(uint64x2_t) * 4);

do {
/* p1 ^= p2 */
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

/* p1 ^= p3 */
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

/* p1 ^= p4 */
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

/* store */
vst1q_u64(dp1 + 0, v0);
vst1q_u64(dp1 + 2, v1);
vst1q_u64(dp1 + 4, v2);
vst1q_u64(dp1 + 6, v3);

dp1 += 8;
dp2 += 8;
dp3 += 8;
dp4 += 8;
} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3,
unsigned long *p4, unsigned long *p5)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
uint64_t *dp3 = (uint64_t *)p3;
uint64_t *dp4 = (uint64_t *)p4;
uint64_t *dp5 = (uint64_t *)p5;

register uint64x2_t v0, v1, v2, v3;
long lines = bytes / (sizeof(uint64x2_t) * 4);

do {
/* p1 ^= p2 */
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

/* p1 ^= p3 */
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

/* p1 ^= p4 */
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

/* p1 ^= p5 */
v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

/* store */
vst1q_u64(dp1 + 0, v0);
vst1q_u64(dp1 + 2, v1);
vst1q_u64(dp1 + 4, v2);
vst1q_u64(dp1 + 6, v3);

dp1 += 8;
dp2 += 8;
dp3 += 8;
dp4 += 8;
dp5 += 8;
} while (--lines > 0);
}

struct xor_block_template const xor_block_inner_neon = {
.name = "__inner_neon__",
.do_2 = xor_arm64_neon_2,
.do_3 = xor_arm64_neon_3,
.do_4 = xor_arm64_neon_4,
.do_5 = xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <[email protected]>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");

0 comments on commit df5b8a2

Please sign in to comment.