From d5a06841fc4bfa3be3d19f1c4498577aeb6f9b1f Mon Sep 17 00:00:00 2001 From: Kit Westneat Date: Mon, 25 Mar 2024 16:38:31 +0000 Subject: [PATCH 01/13] Add arm crc32c cpp file from google --- src/contrib/crc32c_arm64.cc | 123 ++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 src/contrib/crc32c_arm64.cc diff --git a/src/contrib/crc32c_arm64.cc b/src/contrib/crc32c_arm64.cc new file mode 100644 index 000000000..2595135f6 --- /dev/null +++ b/src/contrib/crc32c_arm64.cc @@ -0,0 +1,123 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_arm64.h" + +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable ARM NEON CRC32C +// instructions. + +// This implementation is based on https://github.com/google/leveldb/pull/490. + +#include +#include + +#include "./crc32c_internal.h" +#include "crc32c/crc32c_config.h" + +#if HAVE_ARM64_CRC32C + +#include +#include + +#define KBYTES 1032 +#define SEGMENTBYTES 256 + +// compute 8bytes for each segment parallelly +#define CRC32C32BYTES(P, IND) \ + do { \ + crc1 = __crc32cd( \ + crc1, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 1 + (IND))); \ + crc2 = __crc32cd( \ + crc2, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 2 + (IND))); \ + crc3 = __crc32cd( \ + crc3, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 3 + (IND))); \ + crc0 = __crc32cd( \ + crc0, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 0 + (IND))); \ + } while (0); + +// compute 8*8 bytes for each segment parallelly +#define CRC32C256BYTES(P, IND) \ + do { \ + CRC32C32BYTES((P), (IND)*8 + 0) \ + CRC32C32BYTES((P), (IND)*8 + 1) \ + CRC32C32BYTES((P), (IND)*8 + 2) \ + CRC32C32BYTES((P), (IND)*8 + 3) \ + CRC32C32BYTES((P), (IND)*8 + 4) \ + CRC32C32BYTES((P), (IND)*8 + 5) \ + CRC32C32BYTES((P), (IND)*8 + 6) \ + CRC32C32BYTES((P), (IND)*8 + 7) \ + } while (0); + +// compute 4*8*8 bytes for each segment parallelly +#define CRC32C1024BYTES(P) \ + do { \ + CRC32C256BYTES((P), 0) \ + CRC32C256BYTES((P), 1) \ + CRC32C256BYTES((P), 2) \ + CRC32C256BYTES((P), 3) \ + (P) += 4 * SEGMENTBYTES; \ + } while (0) + +namespace crc32c { + +uint32_t ExtendArm64(uint32_t crc, const uint8_t *data, size_t size) { + int64_t length = size; + uint32_t crc0, crc1, crc2, crc3; + uint64_t t0, t1, t2; + + // k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)), + // k2=CRC(x^(SEGMENTBYTES*8)) + const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4; + + crc = crc ^ kCRC32Xor; + + while (length >= KBYTES) { + crc0 = crc; + crc1 = 0; + crc2 = 0; + crc3 = 0; + + // Process 1024 bytes in parallel. + CRC32C1024BYTES(data); + + // Merge the 4 partial CRC32C values. 
+ t2 = (uint64_t)vmull_p64(crc2, k2); + t1 = (uint64_t)vmull_p64(crc1, k1); + t0 = (uint64_t)vmull_p64(crc0, k0); + crc = __crc32cd(crc3, *(uint64_t *)data); + data += sizeof(uint64_t); + crc ^= __crc32cd(0, t2); + crc ^= __crc32cd(0, t1); + crc ^= __crc32cd(0, t0); + + length -= KBYTES; + } + + while (length >= 8) { + crc = __crc32cd(crc, *(uint64_t *)data); + data += 8; + length -= 8; + } + + if (length & 4) { + crc = __crc32cw(crc, *(uint32_t *)data); + data += 4; + } + + if (length & 2) { + crc = __crc32ch(crc, *(uint16_t *)data); + data += 2; + } + + if (length & 1) { + crc = __crc32cb(crc, *data); + } + + return crc ^ kCRC32Xor; +} + +} // namespace crc32c + +#endif // HAVE_ARM64_CRC32C From c53f9bd893cbc89f25b6e55030a7743ad181a5c2 Mon Sep 17 00:00:00 2001 From: Kit Westneat Date: Mon, 25 Mar 2024 16:40:06 +0000 Subject: [PATCH 02/13] Pull kCRC32Xor constant out of header --- src/contrib/crc32c_arm64.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/contrib/crc32c_arm64.cc b/src/contrib/crc32c_arm64.cc index 2595135f6..3457aba8d 100644 --- a/src/contrib/crc32c_arm64.cc +++ b/src/contrib/crc32c_arm64.cc @@ -13,7 +13,6 @@ #include #include -#include "./crc32c_internal.h" #include "crc32c/crc32c_config.h" #if HAVE_ARM64_CRC32C @@ -62,6 +61,7 @@ namespace crc32c { +static constexpr const uint32_t kCRC32Xor = static_cast(0xffffffffU); uint32_t ExtendArm64(uint32_t crc, const uint8_t *data, size_t size) { int64_t length = size; uint32_t crc0, crc1, crc2, crc3; From 9a0509eb2b125a054fb3f7f5876814c35581b1c7 Mon Sep 17 00:00:00 2001 From: Kit Westneat Date: Mon, 25 Mar 2024 16:43:45 +0000 Subject: [PATCH 03/13] Remove unused headers from arm crc32c, convert to C and add to build. modify niova core to use niova_crc wrapper --- Makefile.am | 15 +++++++++++++-- configure.ac | 7 +++++++ src/contrib/{crc32c_arm64.cc => crc32c_arm64.c} | 16 ++-------------- src/include/crc32.h | 9 ++++++++- src/pumice_db.c | 2 +- src/raft_server.c | 2 +- 6 files changed, 32 insertions(+), 19 deletions(-) rename src/contrib/{crc32c_arm64.cc => crc32c_arm64.c} (92%) diff --git a/Makefile.am b/Makefile.am index ee8676667..253bc5f28 100644 --- a/Makefile.am +++ b/Makefile.am @@ -13,6 +13,18 @@ my_include_HEADERS = ACLOCAL_AMFLAGS = -I m4 +if ARCH_X86 +ARCH_SOURCES = src/contrib/crc32c-pcl-intel-asm_64.S \ + src/contrib/crct10dif-pcl-asm_64.S +else +if ARCH_ARM +ARCH_SOURCES = src/contrib/crc32c_arm64.c +AM_CFLAGS += -march=armv8-a+crc+crypto +else +ARCH_SOURCES = +endif +endif + CORE_HDRS = src/include/atomic.h \ src/include/alloc.h \ src/include/binary_hist.h \ @@ -58,9 +70,8 @@ CORE_SOURCES = $(CORE_HDRS) \ src/alloc.c \ src/buffer.c \ src/config_token.c \ - src/contrib/crc32c-pcl-intel-asm_64.S \ - src/contrib/crct10dif-pcl-asm_64.S \ src/contrib/crc24q.c \ + $(ARCH_SOURCES) \ src/contrib/dlmalloc.c \ src/ctl_interface.c \ src/ctl_interface_cmd.c \ diff --git a/configure.ac b/configure.ac index 8bc77b3bc..60e70bc9d 100644 --- a/configure.ac +++ b/configure.ac @@ -197,4 +197,11 @@ AC_ARG_ENABLE( AC_CONFIG_FILES([niova.pc]) +AC_CANONICAL_HOST +AS_CASE(["$host_cpu"], + [arm*|aarch64*], [arm=true], + [i?86|x86_64], [x86=true] +) +AM_CONDITIONAL([ARCH_ARM], [test x$arm = xtrue]) +AM_CONDITIONAL([ARCH_X86], [test x$x86 = xtrue]) AC_OUTPUT diff --git a/src/contrib/crc32c_arm64.cc b/src/contrib/crc32c_arm64.c similarity index 92% rename from src/contrib/crc32c_arm64.cc rename to src/contrib/crc32c_arm64.c index 3457aba8d..42b5246f9 100644 --- a/src/contrib/crc32c_arm64.cc +++ 
b/src/contrib/crc32c_arm64.c @@ -2,20 +2,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "./crc32c_arm64.h" - // In a separate source file to allow this accelerated CRC32C function to be // compiled with the appropriate compiler flags to enable ARM NEON CRC32C // instructions. // This implementation is based on https://github.com/google/leveldb/pull/490. -#include -#include - -#include "crc32c/crc32c_config.h" -#if HAVE_ARM64_CRC32C +#include #include #include @@ -59,10 +53,8 @@ (P) += 4 * SEGMENTBYTES; \ } while (0) -namespace crc32c { - static constexpr const uint32_t kCRC32Xor = static_cast(0xffffffffU); -uint32_t ExtendArm64(uint32_t crc, const uint8_t *data, size_t size) { +uint32_t crc32_arm(uint32_t crc, const uint8_t *data, size_t size) { int64_t length = size; uint32_t crc0, crc1, crc2, crc3; uint64_t t0, t1, t2; @@ -117,7 +109,3 @@ uint32_t ExtendArm64(uint32_t crc, const uint8_t *data, size_t size) { return crc ^ kCRC32Xor; } - -} // namespace crc32c - -#endif // HAVE_ARM64_CRC32C diff --git a/src/include/crc32.h b/src/include/crc32.h index 5dd129c31..b389bda4f 100644 --- a/src/include/crc32.h +++ b/src/include/crc32.h @@ -17,9 +17,16 @@ crc_pcl(const unsigned char *buffer, int len, unsigned int crc_init); extern uint16_t crc_t10dif_pcl(uint16_t crc_init, const unsigned char *buffer, size_t len); -#endif #define niova_crc crc_pcl #define niova_t10dif_crc crc_t10dif_pcl +#elif defined(__aarch64__) +extern uint32_t +crc32_arm(uint32_t crc, const uint8_t *data, size_t size); + +#define niova_crc(buf, len, init) crc32_arm(init, buf, len) + +#endif + #endif diff --git a/src/pumice_db.c b/src/pumice_db.c index a5d72975f..28e57b654 100644 --- a/src/pumice_db.c +++ b/src/pumice_db.c @@ -198,7 +198,7 @@ pmdb_obj_crc_calc(struct pmdb_object *obj) const int crc_len = sizeof(struct pmdb_object) - offset; NIOVA_ASSERT(crc_len >= 0); - obj->pmdb_obj_crc = crc_pcl(buf, crc_len, 0); + obj->pmdb_obj_crc = niova_crc(buf, crc_len, 0); } static void diff --git a/src/raft_server.c b/src/raft_server.c index 5b4d9a4ce..6de17df10 100644 --- a/src/raft_server.c +++ b/src/raft_server.c @@ -643,7 +643,7 @@ raft_server_entry_calc_crc(const struct raft_entry *re) const int crc_len = sizeof(struct raft_entry) + rh->reh_data_size - offset; NIOVA_ASSERT(crc_len >= 0); - crc32_t crc = crc_pcl(buf, crc_len, 0); + crc32_t crc = niova_crc(buf, crc_len, 0); DBG_RAFT_ENTRY(((rh->reh_crc && crc != rh->reh_crc) ? 
LL_WARN : LL_DEBUG), &re->re_header, "calculated crc=%u", crc); From b11f02b8590e8429caeba2d5ff8a2aa9cd98db2c Mon Sep 17 00:00:00 2001 From: Kit Westneat Date: Mon, 25 Mar 2024 16:45:39 +0000 Subject: [PATCH 04/13] Modify arm crc32c XOR constant to match x86 version --- src/contrib/crc32c_arm64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/contrib/crc32c_arm64.c b/src/contrib/crc32c_arm64.c index 42b5246f9..8a8c45169 100644 --- a/src/contrib/crc32c_arm64.c +++ b/src/contrib/crc32c_arm64.c @@ -53,7 +53,8 @@ (P) += 4 * SEGMENTBYTES; \ } while (0) -static constexpr const uint32_t kCRC32Xor = static_cast(0xffffffffU); +// niova-core uses linux convention of 0 xor +#define kCRC32Xor 0x0 uint32_t crc32_arm(uint32_t crc, const uint8_t *data, size_t size) { int64_t length = size; uint32_t crc0, crc1, crc2, crc3; From dc39fc97a28d76c911934c67b15077d9daffe15c Mon Sep 17 00:00:00 2001 From: Paul Nowoczynski Date: Mon, 25 Mar 2024 12:59:46 -0400 Subject: [PATCH 05/13] Try to add aarch64 to build workflow --- .github/workflows/c-cpp.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index f28e6e946..3ebb6bd45 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -10,6 +10,11 @@ jobs: build: timeout-minutes: 30 runs-on: ubuntu-latest + name: Build on ${{ matrix.distro }} ${{ matrix.arch }} + + strategy: + matrix: + arch: [x64, aarch64] steps: - uses: actions/checkout@v2 From dd42b1ccccf897a6f83afaf78ef89d41db71ba25 Mon Sep 17 00:00:00 2001 From: Kit Westneat Date: Mon, 25 Mar 2024 17:00:29 +0000 Subject: [PATCH 06/13] Add arm crc32 test to micro test --- test/micro-test.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/micro-test.c b/test/micro-test.c index 1d067405a..0d91cc6f9 100644 --- a/test/micro-test.c +++ b/test/micro-test.c @@ -199,6 +199,10 @@ simple_crc32_64byte_buf(void) val = crc_pcl((const unsigned char *)buffer, (sizeof(uint64_t) * 8), 0 ^ 0xFFFFFFFF) ^ 0xFFFFFFFF; (void)val; +#elif defined(__aarch64__) + val = crc32_arm(0 ^ 0xFFFFFFFF, (const unsigned char *)buffer, + (sizeof(uint64_t) * 8)) ^ 0xFFFFFFFF; + (void)val; #endif return; From 859c821c0b642fe6a25872a8bb4c26167b3ca733 Mon Sep 17 00:00:00 2001 From: Paul Nowoczynski Date: Mon, 25 Mar 2024 13:04:26 -0400 Subject: [PATCH 07/13] Try to add aarch64 to build workflow take 2 --- .github/workflows/c-cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 3ebb6bd45..fe13c0340 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -13,8 +13,8 @@ jobs: name: Build on ${{ matrix.distro }} ${{ matrix.arch }} strategy: - matrix: - arch: [x64, aarch64] + matrix: + arch: [x64, aarch64] steps: - uses: actions/checkout@v2 From e640cf50bf9f660286b5db45f6a0abdfc847212e Mon Sep 17 00:00:00 2001 From: Paul Nowoczynski Date: Mon, 25 Mar 2024 13:18:32 -0400 Subject: [PATCH 08/13] Revert "Try to add aarch64 to build workflow take 2" This reverts commit 859c821c0b642fe6a25872a8bb4c26167b3ca733. 
--- .github/workflows/c-cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index fe13c0340..3ebb6bd45 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -13,8 +13,8 @@ jobs: name: Build on ${{ matrix.distro }} ${{ matrix.arch }} strategy: - matrix: - arch: [x64, aarch64] + matrix: + arch: [x64, aarch64] steps: - uses: actions/checkout@v2 From 8d6dd41fa6f3f9f3fdcd8db421439643f6c4da7c Mon Sep 17 00:00:00 2001 From: Paul Nowoczynski Date: Mon, 25 Mar 2024 13:18:33 -0400 Subject: [PATCH 09/13] Revert "Try to add aarch64 to build workflow" This reverts commit dc39fc97a28d76c911934c67b15077d9daffe15c. --- .github/workflows/c-cpp.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 3ebb6bd45..f28e6e946 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -10,11 +10,6 @@ jobs: build: timeout-minutes: 30 runs-on: ubuntu-latest - name: Build on ${{ matrix.distro }} ${{ matrix.arch }} - - strategy: - matrix: - arch: [x64, aarch64] steps: - uses: actions/checkout@v2 From e89f46fd7e62bcd8c8939dccd76e4feacbfa29ad Mon Sep 17 00:00:00 2001 From: Kit Westneat Date: Wed, 27 Mar 2024 17:06:56 +0000 Subject: [PATCH 10/13] Add linux arm64 t10 --- src/contrib/crct10dif-ce-core.S | 514 ++++++++++++++++++++++++++++++++ 1 file changed, 514 insertions(+) create mode 100644 src/contrib/crct10dif-ce-core.S diff --git a/src/contrib/crct10dif-ce-core.S b/src/contrib/crct10dif-ce-core.S new file mode 100644 index 000000000..5604de61d --- /dev/null +++ b/src/contrib/crct10dif-ce-core.S @@ -0,0 +1,514 @@ +// +// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd +// Copyright (C) 2019 Google LLC +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. +// + +// Derived from the x86 version: +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk +// Vinodh Gopal +// James Guilford +// Tim Chen +// +// This software is available to you under a choice of one of two +// licenses. You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// + +#include +#include + + .text + .arch armv8-a+crypto + + init_crc .req w0 + buf .req x1 + len .req x2 + fold_consts_ptr .req x3 + + fold_consts .req v10 + + ad .req v14 + + k00_16 .req v15 + k32_48 .req v16 + + t3 .req v17 + t4 .req v18 + t5 .req v19 + t6 .req v20 + t7 .req v21 + t8 .req v22 + t9 .req v23 + + perm1 .req v24 + perm2 .req v25 + perm3 .req v26 + perm4 .req v27 + + bd1 .req v28 + bd2 .req v29 + bd3 .req v30 + bd4 .req v31 + + .macro __pmull_init_p64 + .endm + + .macro __pmull_pre_p64, bd + .endm + + .macro __pmull_init_p8 + // k00_16 := 0x0000000000000000_000000000000ffff + // k32_48 := 0x00000000ffffffff_0000ffffffffffff + movi k32_48.2d, #0xffffffff + mov k32_48.h[2], k32_48.h[0] + ushr k00_16.2d, k32_48.2d, #32 + + // prepare the permutation vectors + mov_q x5, 0x080f0e0d0c0b0a09 + movi perm4.8b, #8 + dup perm1.2d, x5 + eor perm1.16b, perm1.16b, perm4.16b + ushr perm2.2d, perm1.2d, #8 + ushr perm3.2d, perm1.2d, #16 + ushr perm4.2d, perm1.2d, #24 + sli perm2.2d, perm1.2d, #56 + sli perm3.2d, perm1.2d, #48 + sli perm4.2d, perm1.2d, #40 + .endm + + .macro __pmull_pre_p8, bd + tbl bd1.16b, {\bd\().16b}, perm1.16b + tbl bd2.16b, {\bd\().16b}, perm2.16b + tbl bd3.16b, {\bd\().16b}, perm3.16b + tbl bd4.16b, {\bd\().16b}, perm4.16b + .endm + +SYM_FUNC_START_LOCAL(__pmull_p8_core) +.L__pmull_p8_core: + ext t4.8b, ad.8b, ad.8b, #1 // A1 + ext t5.8b, ad.8b, ad.8b, #2 // A2 + ext t6.8b, ad.8b, ad.8b, #3 // A3 + + pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B + pmull t8.8h, ad.8b, bd1.8b // E = A*B1 + pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B + pmull t7.8h, ad.8b, bd2.8b // G = A*B2 + pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B + pmull t9.8h, ad.8b, bd3.8b // I = A*B3 + pmull t3.8h, ad.8b, bd4.8b // K = A*B4 + b 0f + +.L__pmull_p8_core2: + tbl t4.16b, {ad.16b}, perm1.16b // A1 + tbl t5.16b, {ad.16b}, perm2.16b // A2 + tbl t6.16b, {ad.16b}, perm3.16b // A3 + + pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B + pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1 + pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B + pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2 + pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B + pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3 + pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4 + +0: eor t4.16b, t4.16b, t8.16b // L = E + F + eor t5.16b, t5.16b, t7.16b // M = G + H + eor t6.16b, t6.16b, t9.16b // N = I + J + + uzp1 t8.2d, t4.2d, t5.2d + uzp2 t4.2d, t4.2d, t5.2d + uzp1 t7.2d, t6.2d, t3.2d + uzp2 t6.2d, t6.2d, t3.2d + + // t4 = (L) (P0 + P1) << 8 
+ // t5 = (M) (P2 + P3) << 16 + eor t8.16b, t8.16b, t4.16b + and t4.16b, t4.16b, k32_48.16b + + // t6 = (N) (P4 + P5) << 24 + // t7 = (K) (P6 + P7) << 32 + eor t7.16b, t7.16b, t6.16b + and t6.16b, t6.16b, k00_16.16b + + eor t8.16b, t8.16b, t4.16b + eor t7.16b, t7.16b, t6.16b + + zip2 t5.2d, t8.2d, t4.2d + zip1 t4.2d, t8.2d, t4.2d + zip2 t3.2d, t7.2d, t6.2d + zip1 t6.2d, t7.2d, t6.2d + + ext t4.16b, t4.16b, t4.16b, #15 + ext t5.16b, t5.16b, t5.16b, #14 + ext t6.16b, t6.16b, t6.16b, #13 + ext t3.16b, t3.16b, t3.16b, #12 + + eor t4.16b, t4.16b, t5.16b + eor t6.16b, t6.16b, t3.16b + ret +SYM_FUNC_END(__pmull_p8_core) + + .macro __pmull_p8, rq, ad, bd, i + .ifnc \bd, fold_consts + .err + .endif + mov ad.16b, \ad\().16b + .ifb \i + pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B + .else + pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B + .endif + + bl .L__pmull_p8_core\i + + eor \rq\().16b, \rq\().16b, t4.16b + eor \rq\().16b, \rq\().16b, t6.16b + .endm + + // Fold reg1, reg2 into the next 32 data bytes, storing the result back + // into reg1, reg2. + .macro fold_32_bytes, p, reg1, reg2 + ldp q11, q12, [buf], #0x20 + + __pmull_\p v8, \reg1, fold_consts, 2 + __pmull_\p \reg1, \reg1, fold_consts + +CPU_LE( rev64 v11.16b, v11.16b ) +CPU_LE( rev64 v12.16b, v12.16b ) + + __pmull_\p v9, \reg2, fold_consts, 2 + __pmull_\p \reg2, \reg2, fold_consts + +CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) +CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) + + eor \reg1\().16b, \reg1\().16b, v8.16b + eor \reg2\().16b, \reg2\().16b, v9.16b + eor \reg1\().16b, \reg1\().16b, v11.16b + eor \reg2\().16b, \reg2\().16b, v12.16b + .endm + + // Fold src_reg into dst_reg, optionally loading the next fold constants + .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts + __pmull_\p v8, \src_reg, fold_consts + __pmull_\p \src_reg, \src_reg, fold_consts, 2 + .ifnb \load_next_consts + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + __pmull_pre_\p fold_consts + .endif + eor \dst_reg\().16b, \dst_reg\().16b, v8.16b + eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b + .endm + + .macro __pmull_p64, rd, rn, rm, n + .ifb \n + pmull \rd\().1q, \rn\().1d, \rm\().1d + .else + pmull2 \rd\().1q, \rn\().2d, \rm\().2d + .endif + .endm + + .macro crc_t10dif_pmull, p + __pmull_init_\p + + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp len, #256 + b.lt .Lless_than_256_bytes_\@ + + adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts + + // Load the first 128 data bytes. Byte swapping is necessary to make + // the bit order match the polynomial coefficient order. + ldp q0, q1, [buf] + ldp q2, q3, [buf, #0x20] + ldp q4, q5, [buf, #0x40] + ldp q6, q7, [buf, #0x60] + add buf, buf, #0x80 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( rev64 v1.16b, v1.16b ) +CPU_LE( rev64 v2.16b, v2.16b ) +CPU_LE( rev64 v3.16b, v3.16b ) +CPU_LE( rev64 v4.16b, v4.16b ) +CPU_LE( rev64 v5.16b, v5.16b ) +CPU_LE( rev64 v6.16b, v6.16b ) +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) +CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) +CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) +CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) +CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) +CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) +CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v8.16b, #0 + mov v8.h[7], init_crc + eor v0.16b, v0.16b, v8.16b + + // Load the constants for folding across 128 bytes. 
+ ld1 {fold_consts.2d}, [fold_consts_ptr] + __pmull_pre_\p fold_consts + + // Subtract 128 for the 128 data bytes just consumed. Subtract another + // 128 to simplify the termination condition of the following loop. + sub len, len, #256 + + // While >= 128 data bytes remain (not counting v0-v7), fold the 128 + // bytes v0-v7 into them, storing the result back into v0-v7. +.Lfold_128_bytes_loop_\@: + fold_32_bytes \p, v0, v1 + fold_32_bytes \p, v2, v3 + fold_32_bytes \p, v4, v5 + fold_32_bytes \p, v6, v7 + + subs len, len, #128 + b.ge .Lfold_128_bytes_loop_\@ + + // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. + + // Fold across 64 bytes. + add fold_consts_ptr, fold_consts_ptr, #16 + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + __pmull_pre_\p fold_consts + fold_16_bytes \p, v0, v4 + fold_16_bytes \p, v1, v5 + fold_16_bytes \p, v2, v6 + fold_16_bytes \p, v3, v7, 1 + // Fold across 32 bytes. + fold_16_bytes \p, v4, v6 + fold_16_bytes \p, v5, v7, 1 + // Fold across 16 bytes. + fold_16_bytes \p, v6, v7 + + // Add 128 to get the correct number of data bytes remaining in 0...127 + // (not counting v7), following the previous extra subtraction by 128. + // Then subtract 16 to simplify the termination condition of the + // following loop. + adds len, len, #(128-16) + + // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 + // into them, storing the result back into v7. + b.lt .Lfold_16_bytes_loop_done_\@ +.Lfold_16_bytes_loop_\@: + __pmull_\p v8, v7, fold_consts + __pmull_\p v7, v7, fold_consts, 2 + eor v7.16b, v7.16b, v8.16b + ldr q0, [buf], #16 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + eor v7.16b, v7.16b, v0.16b + subs len, len, #16 + b.ge .Lfold_16_bytes_loop_\@ + +.Lfold_16_bytes_loop_done_\@: + // Add 16 to get the correct number of data bytes remaining in 0...15 + // (not counting v7), following the previous extra subtraction by 16. + adds len, len, #16 + b.eq .Lreduce_final_16_bytes_\@ + +.Lhandle_partial_segment_\@: + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first + // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To + // do this without needing a fold constant for each possible 'len', + // redivide the bytes into a first chunk of 'len' bytes and a second + // chunk of 16 bytes, then fold the first chunk into the second. + + // v0 = last 16 original data bytes + add buf, buf, len + ldr q0, [buf, #-16] +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + + // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. + adr_l x4, .Lbyteshift_table + 16 + sub x4, x4, len + ld1 {v2.16b}, [x4] + tbl v1.16b, {v7.16b}, v2.16b + + // v3 = first chunk: v7 right-shifted by '16-len' bytes. + movi v3.16b, #0x80 + eor v2.16b, v2.16b, v3.16b + tbl v3.16b, {v7.16b}, v2.16b + + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. + sshr v2.16b, v2.16b, #7 + + // v2 = second chunk: 'len' bytes from v0 (low-order bytes), + // then '16-len' bytes from v1 (high-order bytes). + bsl v2.16b, v1.16b, v0.16b + + // Fold the first chunk into the second chunk, storing the result in v7. + __pmull_\p v0, v3, fold_consts + __pmull_\p v7, v3, fold_consts, 2 + eor v7.16b, v7.16b, v0.16b + eor v7.16b, v7.16b, v2.16b + +.Lreduce_final_16_bytes_\@: + // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. + + movi v2.16b, #0 // init zero register + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. 
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + __pmull_pre_\p fold_consts + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + ext v0.16b, v2.16b, v7.16b, #8 + __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x)) + eor v0.16b, v0.16b, v7.16b // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits + mov v0.s[3], v2.s[0] // zero high 32 bits + __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x)) + eor v0.16b, v0.16b, v1.16b // + low bits + + // Load G(x) and floor(x^48 / G(x)). + ld1 {fold_consts.2d}, [fold_consts_ptr] + __pmull_pre_\p fold_consts + + // Use Barrett reduction to compute the final CRC value. + __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x)) + ushr v1.2d, v1.2d, #32 // /= x^32 + __pmull_\p v1, v1, fold_consts // *= G(x) + ushr v0.2d, v0.2d, #48 + eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. + + umov w0, v0.h[0] + .ifc \p, p8 + frame_pop + .endif + ret + +.Lless_than_256_bytes_\@: + // Checksumming a buffer of length 16...255 bytes + + adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + ldr q7, [buf], #0x10 +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v0.16b, #0 + mov v0.h[7], init_crc + eor v7.16b, v7.16b, v0.16b + + // Load the fold-across-16-bytes constants. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + __pmull_pre_\p fold_consts + + cmp len, #16 + b.eq .Lreduce_final_16_bytes_\@ // len == 16 + subs len, len, #32 + b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 + add len, len, #16 + b .Lhandle_partial_segment_\@ // 17 <= len <= 31 + .endm + +// +// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +SYM_FUNC_START(crc_t10dif_pmull_p8) + frame_push 1 + crc_t10dif_pmull p8 +SYM_FUNC_END(crc_t10dif_pmull_p8) + + .align 5 +// +// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. 
+// +SYM_FUNC_START(crc_t10dif_pmull_p64) + crc_t10dif_pmull p64 +SYM_FUNC_END(crc_t10dif_pmull_p64) + + .section ".rodata", "a" + .align 4 + +// Fold constants precomputed from the polynomial 0x18bb7 +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 // x^(8*128) mod G(x) + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) +// .Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 // x^(4*128) mod G(x) + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) +// .Lfold_across_32_bytes_consts: + .quad 0x000000000000857d // x^(2*128) mod G(x) + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 // x^(1*128) mod G(x) + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) +// .Lfinal_fold_consts: + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) +// .Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 // G(x) + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) + +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. +.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 From ac74281d2fc98d736b8024ce9bd829135e8005f7 Mon Sep 17 00:00:00 2001 From: Kit Westneat Date: Wed, 27 Mar 2024 22:19:44 +0000 Subject: [PATCH 11/13] Modify arm t10dif code to work in niova-core, and add to autoconf --- Makefile.am | 4 +- configure.ac | 42 +++++++++ ...ct10dif-ce-core.S => crct10dif-ce-arm64.S} | 90 +++++++++++-------- src/include/crc32.h | 6 ++ 4 files changed, 102 insertions(+), 40 deletions(-) rename src/contrib/{crct10dif-ce-core.S => crct10dif-ce-arm64.S} (92%) diff --git a/Makefile.am b/Makefile.am index 253bc5f28..aa8bdc33e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -18,8 +18,10 @@ ARCH_SOURCES = src/contrib/crc32c-pcl-intel-asm_64.S \ src/contrib/crct10dif-pcl-asm_64.S else if ARCH_ARM -ARCH_SOURCES = src/contrib/crc32c_arm64.c +ARCH_SOURCES = src/contrib/crc32c_arm64.c \ + src/contrib/crct10dif-ce-arm64.S AM_CFLAGS += -march=armv8-a+crc+crypto +AM_CCASFLAGS = -march=armv8-a+aes+crc+crypto else ARCH_SOURCES = endif diff --git a/configure.ac b/configure.ac index 60e70bc9d..149d7565b 100644 --- a/configure.ac +++ b/configure.ac @@ -141,6 +141,48 @@ AC_CHECK_LIB([rocksdb],[rocksdb_checkpoint_object_destroy],, # restore the original LIBS LIBS=$LIBS_save +AC_MSG_CHECKING([for 64-bit PMULL support]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +#include +#include +#include +#include +]], [[ +#if defined(__aarch64__) +unsigned long hwcap = getauxval(AT_HWCAP); +if (hwcap & HWCAP_PMULL) + return 0; +#endif +return -1; +]]) +], [ +AC_DEFINE([HAVE_PMULL64], [1], [Define as 1 if you have 64b PMULL support]) +AC_MSG_RESULT([yes]) +], [ +AC_MSG_RESULT([no]) +]) + +AC_MSG_CHECKING([if little-endian]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +#include +]], [[ +#if defined(__aarch64__) + union { + unsigned int i; + char c[sizeof(unsigned int)]; + } x; + x.i = 1; + return x.c[0]; +#endif +return -1; +]]) +], [ +AC_DEFINE([ARM_LE], [1], [System is little endian]) +AC_MSG_RESULT([yes]) +], [ +AC_MSG_RESULT([no]) +]) + AC_ARG_ENABLE( [asan], [AS_HELP_STRING([--enable-asan],[address sanitizer 
build])], diff --git a/src/contrib/crct10dif-ce-core.S b/src/contrib/crct10dif-ce-arm64.S similarity index 92% rename from src/contrib/crct10dif-ce-core.S rename to src/contrib/crct10dif-ce-arm64.S index 5604de61d..1f9004c59 100644 --- a/src/contrib/crct10dif-ce-core.S +++ b/src/contrib/crct10dif-ce-arm64.S @@ -62,41 +62,43 @@ // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf // -#include -#include +#include "niova_core_config.h" +#ifdef ARM_LE + #define CPU_LE(code...) code +#else + #define CPU_LE(code...) +#endif .text - .arch armv8-a+crypto +#define init_crc w0 +#define buf x1 +#define len x2 +#define fold_consts_ptr x3 - init_crc .req w0 - buf .req x1 - len .req x2 - fold_consts_ptr .req x3 +#define fold_consts v10 - fold_consts .req v10 +#define ad v14 - ad .req v14 +#define k00_16 v15 +#define k32_48 v16 - k00_16 .req v15 - k32_48 .req v16 +#define t3 v17 +#define t4 v18 +#define t5 v19 +#define t6 v20 +#define t7 v21 +#define t8 v22 +#define t9 v23 - t3 .req v17 - t4 .req v18 - t5 .req v19 - t6 .req v20 - t7 .req v21 - t8 .req v22 - t9 .req v23 +#define perm1 v24 +#define perm2 v25 +#define perm3 v26 +#define perm4 v27 - perm1 .req v24 - perm2 .req v25 - perm3 .req v26 - perm4 .req v27 - - bd1 .req v28 - bd2 .req v29 - bd3 .req v30 - bd4 .req v31 +#define bd1 v28 +#define bd2 v29 +#define bd3 v30 +#define bd4 v31 .macro __pmull_init_p64 .endm @@ -112,7 +114,11 @@ ushr k00_16.2d, k32_48.2d, #32 // prepare the permutation vectors - mov_q x5, 0x080f0e0d0c0b0a09 + // mov_q x5, 0x080f0e0d0c0b0a09 + movz x5, :abs_g3:0x080f0e0d0c0b0a09 + movk x5, :abs_g2_nc:0x080f0e0d0c0b0a09 + movk x5, :abs_g1_nc:0x080f0e0d0c0b0a09 + movk x5, :abs_g0_nc:0x080f0e0d0c0b0a09 movi perm4.8b, #8 dup perm1.2d, x5 eor perm1.16b, perm1.16b, perm4.16b @@ -131,7 +137,6 @@ tbl bd4.16b, {\bd\().16b}, perm4.16b .endm -SYM_FUNC_START_LOCAL(__pmull_p8_core) .L__pmull_p8_core: ext t4.8b, ad.8b, ad.8b, #1 // A1 ext t5.8b, ad.8b, ad.8b, #2 // A2 @@ -194,7 +199,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core) eor t4.16b, t4.16b, t5.16b eor t6.16b, t6.16b, t3.16b ret -SYM_FUNC_END(__pmull_p8_core) .macro __pmull_p8, rq, ad, bd, i .ifnc \bd, fold_consts @@ -263,7 +267,7 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) cmp len, #256 b.lt .Lless_than_256_bytes_\@ - adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts + /*adrl*/ adr fold_consts_ptr, .Lfold_across_128_bytes_consts // Load the first 128 data bytes. Byte swapping is necessary to make // the bit order match the polynomial coefficient order. @@ -369,7 +373,7 @@ CPU_LE( rev64 v0.16b, v0.16b ) CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. - adr_l x4, .Lbyteshift_table + 16 + /*adrl*/ adr x4, .Lbyteshift_table + 16 sub x4, x4, len ld1 {v2.16b}, [x4] tbl v1.16b, {v7.16b}, v2.16b @@ -429,14 +433,16 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) umov w0, v0.h[0] .ifc \p, p8 - frame_pop +// pop frame + ldr x19, [sp, #16] + ldp x29, x30, [sp], #32 .endif ret .Lless_than_256_bytes_\@: // Checksumming a buffer of length 16...255 bytes - adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts + /*adrl*/ adr fold_consts_ptr, .Lfold_across_16_bytes_consts // Load the first 16 data bytes. ldr q7, [buf], #0x10 @@ -465,10 +471,15 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // // Assumes len >= 16. // -SYM_FUNC_START(crc_t10dif_pmull_p8) - frame_push 1 +.globl crc_t10dif_pmull_p8 +.align 16 +crc_t10dif_pmull_p8: +// push a new frame + stp x29, x30, [sp, #-32]! 
+ mov x29, sp + str x19, [sp, #16] + crc_t10dif_pmull p8 -SYM_FUNC_END(crc_t10dif_pmull_p8) .align 5 // @@ -476,9 +487,10 @@ SYM_FUNC_END(crc_t10dif_pmull_p8) // // Assumes len >= 16. // -SYM_FUNC_START(crc_t10dif_pmull_p64) +.globl crc_t10dif_pmull_p64 +.align 16 +crc_t10dif_pmull_p64: crc_t10dif_pmull p64 -SYM_FUNC_END(crc_t10dif_pmull_p64) .section ".rodata", "a" .align 4 diff --git a/src/include/crc32.h b/src/include/crc32.h index b389bda4f..5f1d70973 100644 --- a/src/include/crc32.h +++ b/src/include/crc32.h @@ -8,6 +8,7 @@ #define NIOVA_CRC32_H 1 #include "common.h" +#include "niova_core_config.h" typedef uint32_t crc32_t; @@ -26,6 +27,11 @@ extern uint32_t crc32_arm(uint32_t crc, const uint8_t *data, size_t size); #define niova_crc(buf, len, init) crc32_arm(init, buf, len) +#if HAVE_PMULL64 +#define niova_t10dif_crc crc_t10dif_pmull_p64 +#else +#define niova_t10dif_crc crc_t10dif_pmull_p8 +#endif #endif From d4f0005f31e1e309c3e334fa16f4c17ebd01ca71 Mon Sep 17 00:00:00 2001 From: Paul Nowoczynski Date: Tue, 2 Apr 2024 18:50:33 +0000 Subject: [PATCH 12/13] Replace niova_core_config.h dep w/ -DHAVE_PMULL64 HAVE_PMULL64 test in configure.ac will now cause -DHAVE_PMULL64 to be added to the CFLAGS. Add crc_t10dif_pmull_p64() and crc_t10dif_pmull_p8() to crc.h. simple_crc_t10dif_64byte_buf() and simple_crc_t10dif_4096byte_buf() are enabled for arm64. --- Makefile.am | 1 + configure.ac | 1 + src/include/crc32.h | 8 +++++++- test/micro-test.c | 12 ++++-------- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Makefile.am b/Makefile.am index aa8bdc33e..0e1b886d6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,6 +22,7 @@ ARCH_SOURCES = src/contrib/crc32c_arm64.c \ src/contrib/crct10dif-ce-arm64.S AM_CFLAGS += -march=armv8-a+crc+crypto AM_CCASFLAGS = -march=armv8-a+aes+crc+crypto + else ARCH_SOURCES = endif diff --git a/configure.ac b/configure.ac index 149d7565b..acdc12a73 100644 --- a/configure.ac +++ b/configure.ac @@ -158,6 +158,7 @@ return -1; ], [ AC_DEFINE([HAVE_PMULL64], [1], [Define as 1 if you have 64b PMULL support]) AC_MSG_RESULT([yes]) +AM_CFLAGS="$AM_CFLAGS -DHAVE_PMULL64" ], [ AC_MSG_RESULT([no]) ]) diff --git a/src/include/crc32.h b/src/include/crc32.h index 5f1d70973..9f5d03332 100644 --- a/src/include/crc32.h +++ b/src/include/crc32.h @@ -8,7 +8,6 @@ #define NIOVA_CRC32_H 1 #include "common.h" -#include "niova_core_config.h" typedef uint32_t crc32_t; @@ -26,7 +25,14 @@ crc_t10dif_pcl(uint16_t crc_init, const unsigned char *buffer, size_t len); extern uint32_t crc32_arm(uint32_t crc, const uint8_t *data, size_t size); +extern uint16_t +crc_t10dif_pmull_p64(uint16_t crc_init, const unsigned char *buffer, size_t len); + +extern uint16_t +crc_t10dif_pmull_p8(uint16_t crc_init, const unsigned char *buffer, size_t len); + #define niova_crc(buf, len, init) crc32_arm(init, buf, len) + #if HAVE_PMULL64 #define niova_t10dif_crc crc_t10dif_pmull_p64 #else diff --git a/test/micro-test.c b/test/micro-test.c index 0d91cc6f9..20219a1cf 100644 --- a/test/micro-test.c +++ b/test/micro-test.c @@ -215,11 +215,9 @@ simple_crc_t10dif_64byte_buf(void) uint64_t buffer[8]; buffer[0] = val; -#if defined(__x86_64__) - val = crc_t10dif_pcl((0 ^ 0xFFFF), (const unsigned char *)buffer, - (sizeof(uint64_t) * 8)) ^ 0xFFFF; + val = niova_t10dif_crc((0 ^ 0xFFFF), (const unsigned char *)buffer, + (sizeof(uint64_t) * 8)) ^ 0xFFFF; (void)val; -#endif return; } @@ -234,11 +232,9 @@ simple_crc_t10dif_4096byte_buf(void) cnt = 0; buffer[cnt++] = val; -#if defined(__x86_64__) - val = crc_t10dif_pcl((0 
^ 0xFFFF), (const unsigned char *)buffer, - 4096) ^ 0xFFFF; + val = niova_t10dif_crc((0 ^ 0xFFFF), (const unsigned char *)buffer, + 4096) ^ 0xFFFF; (void)val; -#endif return; } From e56bd3ddd32613725acfa591aa469299dec833a9 Mon Sep 17 00:00:00 2001 From: Paul Nowoczynski Date: Tue, 2 Apr 2024 19:16:30 +0000 Subject: [PATCH 13/13] Arm config checks are conditional based on cpu --- configure.ac | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/configure.ac b/configure.ac index acdc12a73..65f323a76 100644 --- a/configure.ac +++ b/configure.ac @@ -52,6 +52,14 @@ AC_TYPE_UINT16_T AC_TYPE_UINT32_T AC_TYPE_UINT64_T +AC_CANONICAL_HOST +AS_CASE(["$host_cpu"], + [arm*|aarch64*], [arm=true], + [i?86|x86_64], [x86=true] +) +AM_CONDITIONAL([ARCH_ARM], [test x$arm = xtrue]) +AM_CONDITIONAL([ARCH_X86], [test x$x86 = xtrue]) + AC_HEADER_STDC AC_CHECK_HEADERS([pthread.h], [], [AC_MSG_ERROR([failed to locate pthread.h])], []) @@ -141,6 +149,7 @@ AC_CHECK_LIB([rocksdb],[rocksdb_checkpoint_object_destroy],, # restore the original LIBS LIBS=$LIBS_save +if [test x$arm == xtrue] ; then AC_MSG_CHECKING([for 64-bit PMULL support]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #include @@ -183,6 +192,7 @@ AC_MSG_RESULT([yes]) ], [ AC_MSG_RESULT([no]) ]) +fi AC_ARG_ENABLE( [asan], @@ -240,11 +250,4 @@ AC_ARG_ENABLE( AC_CONFIG_FILES([niova.pc]) -AC_CANONICAL_HOST -AS_CASE(["$host_cpu"], - [arm*|aarch64*], [arm=true], - [i?86|x86_64], [x86=true] -) -AM_CONDITIONAL([ARCH_ARM], [test x$arm = xtrue]) -AM_CONDITIONAL([ARCH_X86], [test x$x86 = xtrue]) AC_OUTPUT
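Usage note (reviewer sketch, not part of the series): a minimal caller-side example of the niova_crc()/niova_t10dif_crc() wrappers wired up above. The include path and buffer setup are assumptions; the XOR handling mirrors test/micro-test.c, and the 0-xor convention comes from patch 04 ("niova-core uses linux convention of 0 xor").

// Sketch of calling the wrappers declared in src/include/crc32.h.
// Assumes "crc32.h" is reachable on the include path (hypothetical build setup).
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#include "crc32.h"   // niova_crc(), niova_t10dif_crc()

int main(void)
{
    unsigned char buf[4096];
    memset(buf, 0xa5, sizeof(buf));

    // CRC32C: the arm64 primitive keeps the Linux 0-xor convention (patch 04),
    // so callers that want the inverted form apply 0xFFFFFFFF around the call
    // themselves, exactly as test/micro-test.c does.
    uint32_t c = niova_crc(buf, sizeof(buf), 0 ^ 0xFFFFFFFF) ^ 0xFFFFFFFF;

    // CRC-T10DIF: on aarch64 this resolves to crc_t10dif_pmull_p64() or
    // crc_t10dif_pmull_p8() depending on HAVE_PMULL64; per the asm comments,
    // the accelerated routines assume len >= 16.
    uint16_t t = niova_t10dif_crc(0 ^ 0xFFFF, buf, sizeof(buf)) ^ 0xFFFF;

    printf("crc32c=%u t10dif=%u\n", (unsigned)c, (unsigned)t);
    return 0;
}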