Skip to content

Commit

Permalink
Merge pull request #318 from 00pauln00/arm-crc32c
Browse files Browse the repository at this point in the history
Arm crc32c
  • Loading branch information
00pauln00 authored Apr 2, 2024
2 parents 2b9e459 + e56bd3d commit eabab4f
Show file tree
Hide file tree
Showing 8 changed files with 737 additions and 13 deletions.
18 changes: 16 additions & 2 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,21 @@ my_include_HEADERS =

ACLOCAL_AMFLAGS = -I m4

if ARCH_X86
ARCH_SOURCES = src/contrib/crc32c-pcl-intel-asm_64.S \
src/contrib/crct10dif-pcl-asm_64.S
else
if ARCH_ARM
ARCH_SOURCES = src/contrib/crc32c_arm64.c \
src/contrib/crct10dif-ce-arm64.S
AM_CFLAGS += -march=armv8-a+crc+crypto
AM_CCASFLAGS = -march=armv8-a+aes+crc+crypto

else
ARCH_SOURCES =
endif
endif

CORE_HDRS = src/include/atomic.h \
src/include/alloc.h \
src/include/binary_hist.h \
Expand Down Expand Up @@ -58,9 +73,8 @@ CORE_SOURCES = $(CORE_HDRS) \
src/alloc.c \
src/buffer.c \
src/config_token.c \
src/contrib/crc32c-pcl-intel-asm_64.S \
src/contrib/crct10dif-pcl-asm_64.S \
src/contrib/crc24q.c \
$(ARCH_SOURCES) \
src/contrib/dlmalloc.c \
src/ctl_interface.c \
src/ctl_interface_cmd.c \
Expand Down
53 changes: 53 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ AC_TYPE_UINT16_T
AC_TYPE_UINT32_T
AC_TYPE_UINT64_T

AC_CANONICAL_HOST
AS_CASE(["$host_cpu"],
[arm*|aarch64*], [arm=true],
[i?86|x86_64], [x86=true]
)
AM_CONDITIONAL([ARCH_ARM], [test x$arm = xtrue])
AM_CONDITIONAL([ARCH_X86], [test x$x86 = xtrue])

AC_HEADER_STDC
AC_CHECK_HEADERS([pthread.h], [],
[AC_MSG_ERROR([failed to locate pthread.h])], [])
Expand Down Expand Up @@ -141,6 +149,51 @@ AC_CHECK_LIB([rocksdb],[rocksdb_checkpoint_object_destroy],,
# restore the original LIBS
LIBS=$LIBS_save

if [test x$arm == xtrue] ; then
AC_MSG_CHECKING([for 64-bit PMULL support])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdio.h>
#include <stdint.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>
]], [[
#if defined(__aarch64__)
unsigned long hwcap = getauxval(AT_HWCAP);
if (hwcap & HWCAP_PMULL)
return 0;
#endif
return -1;
]])
], [
AC_DEFINE([HAVE_PMULL64], [1], [Define as 1 if you have 64b PMULL support])
AC_MSG_RESULT([yes])
AM_CFLAGS="$AM_CFLAGS -DHAVE_PMULL64"
], [
AC_MSG_RESULT([no])
])

AC_MSG_CHECKING([if little-endian])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdio.h>
]], [[
#if defined(__aarch64__)
union {
unsigned int i;
char c[sizeof(unsigned int)];
} x;
x.i = 1;
return x.c[0];
#endif
return -1;
]])
], [
AC_DEFINE([ARM_LE], [1], [System is little endian])
AC_MSG_RESULT([yes])
], [
AC_MSG_RESULT([no])
])
fi

AC_ARG_ENABLE(
[asan],
[AS_HELP_STRING([--enable-asan],[address sanitizer build])],
Expand Down
112 changes: 112 additions & 0 deletions src/contrib/crc32c_arm64.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Copyright 2017 The CRC32C Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// In a separate source file to allow this accelerated CRC32C function to be
// compiled with the appropriate compiler flags to enable ARM NEON CRC32C
// instructions.

// This implementation is based on https://github.com/google/leveldb/pull/490.


#include <stddef.h>

#include <arm_acle.h>
#include <arm_neon.h>

#define KBYTES 1032
#define SEGMENTBYTES 256

// compute 8bytes for each segment parallelly
#define CRC32C32BYTES(P, IND) \
do { \
crc1 = __crc32cd( \
crc1, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 1 + (IND))); \
crc2 = __crc32cd( \
crc2, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 2 + (IND))); \
crc3 = __crc32cd( \
crc3, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 3 + (IND))); \
crc0 = __crc32cd( \
crc0, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 0 + (IND))); \
} while (0);

// compute 8*8 bytes for each segment parallelly
#define CRC32C256BYTES(P, IND) \
do { \
CRC32C32BYTES((P), (IND)*8 + 0) \
CRC32C32BYTES((P), (IND)*8 + 1) \
CRC32C32BYTES((P), (IND)*8 + 2) \
CRC32C32BYTES((P), (IND)*8 + 3) \
CRC32C32BYTES((P), (IND)*8 + 4) \
CRC32C32BYTES((P), (IND)*8 + 5) \
CRC32C32BYTES((P), (IND)*8 + 6) \
CRC32C32BYTES((P), (IND)*8 + 7) \
} while (0);

// compute 4*8*8 bytes for each segment parallelly
#define CRC32C1024BYTES(P) \
do { \
CRC32C256BYTES((P), 0) \
CRC32C256BYTES((P), 1) \
CRC32C256BYTES((P), 2) \
CRC32C256BYTES((P), 3) \
(P) += 4 * SEGMENTBYTES; \
} while (0)

// niova-core uses linux convention of 0 xor
#define kCRC32Xor 0x0
uint32_t crc32_arm(uint32_t crc, const uint8_t *data, size_t size) {
int64_t length = size;
uint32_t crc0, crc1, crc2, crc3;
uint64_t t0, t1, t2;

// k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)),
// k2=CRC(x^(SEGMENTBYTES*8))
const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4;

crc = crc ^ kCRC32Xor;

while (length >= KBYTES) {
crc0 = crc;
crc1 = 0;
crc2 = 0;
crc3 = 0;

// Process 1024 bytes in parallel.
CRC32C1024BYTES(data);

// Merge the 4 partial CRC32C values.
t2 = (uint64_t)vmull_p64(crc2, k2);
t1 = (uint64_t)vmull_p64(crc1, k1);
t0 = (uint64_t)vmull_p64(crc0, k0);
crc = __crc32cd(crc3, *(uint64_t *)data);
data += sizeof(uint64_t);
crc ^= __crc32cd(0, t2);
crc ^= __crc32cd(0, t1);
crc ^= __crc32cd(0, t0);

length -= KBYTES;
}

while (length >= 8) {
crc = __crc32cd(crc, *(uint64_t *)data);
data += 8;
length -= 8;
}

if (length & 4) {
crc = __crc32cw(crc, *(uint32_t *)data);
data += 4;
}

if (length & 2) {
crc = __crc32ch(crc, *(uint16_t *)data);
data += 2;
}

if (length & 1) {
crc = __crc32cb(crc, *data);
}

return crc ^ kCRC32Xor;
}
Loading

0 comments on commit eabab4f

Please sign in to comment.