From aeeb67dc6a59720285ac39770053e6a52828c292 Mon Sep 17 00:00:00 2001 From: Luke Usher Date: Wed, 14 Sep 2022 13:51:05 +0100 Subject: [PATCH] hasher: use xxh3 exclusively --- CMakeLists.txt | 3 - src/common/util/crc32c.cpp | 335 ------------------------------------- src/common/util/crc32c.h | 35 ---- src/common/util/hasher.cpp | 42 ----- src/common/util/hasher.h | 5 +- 5 files changed, 2 insertions(+), 418 deletions(-) delete mode 100644 src/common/util/crc32c.cpp delete mode 100644 src/common/util/crc32c.h delete mode 100644 src/common/util/hasher.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d10741a77..53d6350317 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,7 +77,6 @@ file (GLOB CXBXR_HEADER_COMMON "${CXBXR_ROOT_DIR}/src/common/util/cliConfig.hpp" "${CXBXR_ROOT_DIR}/src/common/util/cliConverter.hpp" "${CXBXR_ROOT_DIR}/src/common/util/CPUID.h" - "${CXBXR_ROOT_DIR}/src/common/util/crc32c.h" "${CXBXR_ROOT_DIR}/src/common/util/CxbxUtil.h" "${CXBXR_ROOT_DIR}/src/common/util/std_extend.hpp" "${CXBXR_ROOT_DIR}/src/common/util/strConverter.hpp" @@ -252,9 +251,7 @@ file (GLOB CXBXR_SOURCE_COMMON "${CXBXR_ROOT_DIR}/src/common/Settings.cpp" "${CXBXR_ROOT_DIR}/src/common/util/cliConfig.cpp" "${CXBXR_ROOT_DIR}/src/common/util/cliConverter.cpp" - "${CXBXR_ROOT_DIR}/src/common/util/crc32c.cpp" "${CXBXR_ROOT_DIR}/src/common/util/CxbxUtil.cpp" - "${CXBXR_ROOT_DIR}/src/common/util/hasher.cpp" "${CXBXR_ROOT_DIR}/src/common/win32/EmuShared.cpp" "${CXBXR_ROOT_DIR}/src/common/win32/InlineFunc.cpp" "${CXBXR_ROOT_DIR}/src/common/win32/IPCWindows.cpp" diff --git a/src/common/util/crc32c.cpp b/src/common/util/crc32c.cpp deleted file mode 100644 index 48dd00d32c..0000000000 --- a/src/common/util/crc32c.cpp +++ /dev/null @@ -1,335 +0,0 @@ -/* - Copyright (c) 2013 - 2014, 2016 Mark Adler, Robert Vazan, Max Vysokikh - - This software is provided 'as-is', without any express or implied - warranty. In no event will the author be held liable for any damages - arising from the use of this software. - - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. This notice may not be removed or altered from any source distribution. -*/ - -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include "crc32c.h" -#include -#include - -#define POLY 0x82f63b78 -#define LONG_SHIFT 8192 -#define SHORT_SHIFT 256 - -typedef const uint8_t *buffer; - -static uint32_t table[16][256]; - -static uint32_t long_shifts[4][256]; - -static uint32_t short_shifts[4][256]; - -static bool _tableInitialized; - -void calculate_table(); - -/* Table-driven software version as a fall-back. This is about 15 times slower - than using the hardware instructions. This assumes little-endian integers, - as is the case on Intel processors that the assembler code here is for. */ -extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crci, buffer input, size_t length) -{ - buffer next = input; -#ifdef _M_X64 - uint64_t crc; -#else - uint32_t crc; -#endif - - crc = crci ^ 0xffffffff; -#ifdef _M_X64 - while (length && ((uintptr_t)next & 7) != 0) - { - crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); - --length; - } - while (length >= 16) - { - crc ^= *(uint64_t *)next; - uint64_t high = *(uint64_t *)(next + 8); - crc = table[15][crc & 0xff] - ^ table[14][(crc >> 8) & 0xff] - ^ table[13][(crc >> 16) & 0xff] - ^ table[12][(crc >> 24) & 0xff] - ^ table[11][(crc >> 32) & 0xff] - ^ table[10][(crc >> 40) & 0xff] - ^ table[9][(crc >> 48) & 0xff] - ^ table[8][crc >> 56] - ^ table[7][high & 0xff] - ^ table[6][(high >> 8) & 0xff] - ^ table[5][(high >> 16) & 0xff] - ^ table[4][(high >> 24) & 0xff] - ^ table[3][(high >> 32) & 0xff] - ^ table[2][(high >> 40) & 0xff] - ^ table[1][(high >> 48) & 0xff] - ^ table[0][high >> 56]; - next += 16; - length -= 16; - } -#else - while (length && ((uintptr_t)next & 3) != 0) - { - crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); - --length; - } - while (length >= 12) - { - crc ^= *(uint32_t *)next; - uint32_t high = *(uint32_t *)(next + 4); - uint32_t high2 = *(uint32_t *)(next + 8); - crc = table[11][crc & 0xff] - ^ table[10][(crc >> 8) & 0xff] - ^ table[9][(crc >> 16) & 0xff] - ^ table[8][crc >> 24] - ^ table[7][high & 0xff] - ^ table[6][(high >> 8) & 0xff] - ^ table[5][(high >> 16) & 0xff] - ^ table[4][high >> 24] - ^ table[3][high2 & 0xff] - ^ table[2][(high2 >> 8) & 0xff] - ^ table[1][(high2 >> 16) & 0xff] - ^ table[0][high2 >> 24]; - next += 12; - length -= 12; - } -#endif - while (length) - { - crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); - --length; - } - return (uint32_t)crc ^ 0xffffffff; -} - -/* Apply the zeros operator table to crc. */ -static inline uint32_t shift_crc(uint32_t shift_table[][256], uint32_t crc) -{ - return shift_table[0][crc & 0xff] - ^ shift_table[1][(crc >> 8) & 0xff] - ^ shift_table[2][(crc >> 16) & 0xff] - ^ shift_table[3][crc >> 24]; -} - -/* Compute CRC-32C using the Intel hardware instruction. */ -extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, buffer buf, size_t len) -{ - buffer next = buf; - buffer end; -#ifdef _M_X64 - uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */ -#else - uint32_t crc0, crc1, crc2; -#endif - - /* pre-process the crc */ - crc0 = crc ^ 0xffffffff; - - /* compute the crc for up to seven leading bytes to bring the data pointer - to an eight-byte boundary */ - while (len && ((uintptr_t)next & 7) != 0) - { - crc0 = _mm_crc32_u8(static_cast(crc0), *next); - ++next; - --len; - } - -#ifdef _M_X64 - /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc - instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem, - Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a - throughput of one crc per cycle, but a latency of three cycles */ - while (len >= 3 * LONG_SHIFT) - { - crc1 = 0; - crc2 = 0; - end = next + LONG_SHIFT; - do - { - crc0 = _mm_crc32_u64(crc0, *reinterpret_cast(next)); - crc1 = _mm_crc32_u64(crc1, *reinterpret_cast(next + LONG_SHIFT)); - crc2 = _mm_crc32_u64(crc2, *reinterpret_cast(next + 2 * LONG_SHIFT)); - next += 8; - } while (next < end); - crc0 = shift_crc(long_shifts, static_cast(crc0)) ^ crc1; - crc0 = shift_crc(long_shifts, static_cast(crc0)) ^ crc2; - next += 2 * LONG_SHIFT; - len -= 3 * LONG_SHIFT; - } - - /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less - than a LONG_SHIFT*3 block */ - while (len >= 3 * SHORT_SHIFT) - { - crc1 = 0; - crc2 = 0; - end = next + SHORT_SHIFT; - do - { - crc0 = _mm_crc32_u64(crc0, *reinterpret_cast(next)); - crc1 = _mm_crc32_u64(crc1, *reinterpret_cast(next + SHORT_SHIFT)); - crc2 = _mm_crc32_u64(crc2, *reinterpret_cast(next + 2 * SHORT_SHIFT)); - next += 8; - } while (next < end); - crc0 = shift_crc(short_shifts, static_cast(crc0)) ^ crc1; - crc0 = shift_crc(short_shifts, static_cast(crc0)) ^ crc2; - next += 2 * SHORT_SHIFT; - len -= 3 * SHORT_SHIFT; - } - - /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3 - block */ - end = next + (len - (len & 7)); - while (next < end) - { - crc0 = _mm_crc32_u64(crc0, *reinterpret_cast(next)); - next += 8; - } -#else - /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc - instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem, - Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a - throughput of one crc per cycle, but a latency of three cycles */ - while (len >= 3 * LONG_SHIFT) - { - crc1 = 0; - crc2 = 0; - end = next + LONG_SHIFT; - do - { - crc0 = _mm_crc32_u32(crc0, *reinterpret_cast(next)); - crc1 = _mm_crc32_u32(crc1, *reinterpret_cast(next + LONG_SHIFT)); - crc2 = _mm_crc32_u32(crc2, *reinterpret_cast(next + 2 * LONG_SHIFT)); - next += 4; - } while (next < end); - crc0 = shift_crc(long_shifts, static_cast(crc0)) ^ crc1; - crc0 = shift_crc(long_shifts, static_cast(crc0)) ^ crc2; - next += 2 * LONG_SHIFT; - len -= 3 * LONG_SHIFT; - } - - /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less - than a LONG_SHIFT*3 block */ - while (len >= 3 * SHORT_SHIFT) - { - crc1 = 0; - crc2 = 0; - end = next + SHORT_SHIFT; - do - { - crc0 = _mm_crc32_u32(crc0, *reinterpret_cast(next)); - crc1 = _mm_crc32_u32(crc1, *reinterpret_cast(next + SHORT_SHIFT)); - crc2 = _mm_crc32_u32(crc2, *reinterpret_cast(next + 2 * SHORT_SHIFT)); - next += 4; - } while (next < end); - crc0 = shift_crc(short_shifts, static_cast(crc0)) ^ crc1; - crc0 = shift_crc(short_shifts, static_cast(crc0)) ^ crc2; - next += 2 * SHORT_SHIFT; - len -= 3 * SHORT_SHIFT; - } - - /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3 - block */ - end = next + (len - (len & 7)); - while (next < end) - { - crc0 = _mm_crc32_u32(crc0, *reinterpret_cast(next)); - next += 4; - } -#endif - len &= 7; - - /* compute the crc for up to seven trailing bytes */ - while (len) - { - crc0 = _mm_crc32_u8(static_cast(crc0), *next); - ++next; - --len; - } - - /* return a post-processed crc */ - return static_cast(crc0) ^ 0xffffffff; -} - -extern "C" CRC32C_API int crc32c_hw_available() -{ - int info[4]; - __cpuid(info, 1); - return (info[2] & (1 << 20)) != 0; - -} - -void calculate_table() -{ - for(int i = 0; i < 256; i++) - { - uint32_t res = (uint32_t)i; - for(int t = 0; t < 16; t++) { - for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1); - table[t][i] = res; - } - } - - _tableInitialized = true; -} - -void calculate_table_hw() -{ - for(int i = 0; i < 256; i++) - { - uint32_t res = (uint32_t)i; - for (int k = 0; k < 8 * (SHORT_SHIFT - 4); k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1); - for(int t = 0; t < 4; t++) { - for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1); - short_shifts[3 - t][i] = res; - } - for (int k = 0; k < 8 * (LONG_SHIFT - 4 - SHORT_SHIFT); k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1); - for(int t = 0; t < 4; t++) { - for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1); - long_shifts[3 - t][i] = res; - } - } -} - -uint32_t (*append_func)(uint32_t, buffer, size_t); - -void __crc32_init() -{ - if (append_func == NULL) - { - // somebody can call sw version directly, so, precalculate table for this version - calculate_table(); - if (crc32c_hw_available()) { - calculate_table_hw(); - append_func = crc32c_append_hw; - } else { - append_func = crc32c_append_sw; - } - } -} - -extern "C" CRC32C_API uint32_t crc32c_append(uint32_t crc, buffer input, size_t length) -{ - if (append_func == NULL) { - __crc32_init(); - } - - return append_func(crc, input, length); -} diff --git a/src/common/util/crc32c.h b/src/common/util/crc32c.h deleted file mode 100644 index 07e8246221..0000000000 --- a/src/common/util/crc32c.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef CRC32C_H -#define CRC32C_H - -#define CRC32C_API - -#include - -/* - Computes CRC-32C (Castagnoli) checksum. Uses Intel's CRC32 instruction if it is available. - Otherwise it uses a very fast software fallback. -*/ -extern "C" CRC32C_API uint32_t crc32c_append( - uint32_t crc, // Initial CRC value. Typically it's 0. - // You can supply non-trivial initial value here. - // Initial value can be used to chain CRC from multiple buffers. - const uint8_t *input, // Data to be put through the CRC algorithm. - size_t length); // Length of the data in the input buffer. - - -/* - Software fallback version of CRC-32C (Castagnoli) checksum. -*/ -extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crc, const uint8_t *input, size_t length); - -/* - Hardware version of CRC-32C (Castagnoli) checksum. Will fail, if CPU does not support related instructions. Use a crc32c_append version instead of. -*/ -extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, const uint8_t *input, size_t length); - -/* - Checks is hardware version of CRC-32C is available. -*/ -extern "C" CRC32C_API int crc32c_hw_available(); - -#endif diff --git a/src/common/util/hasher.cpp b/src/common/util/hasher.cpp deleted file mode 100644 index 5e4bb96332..0000000000 --- a/src/common/util/hasher.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include "hasher.h" - -#include "xxhash.h" -#include "crc32c.h" -#include - -enum { - HASH_NONE = 0, - HASH_XXH3, - HASH_CRC32C -}; - -static int g_HashAlgorithm = HASH_NONE; - -void InitHasher() -{ - // Detect the best hashing algorithm to use for the host machine - // TODO/Future Improvement: This could be expanded to support even more hash algorithims - // And we could hash a random buffer to calculate the fastest hash to use on a given host - printf("Selecting hash algorithm: "); - if (crc32c_hw_available()) { - printf("CRC32C\n"); - g_HashAlgorithm = HASH_CRC32C; - } else { - printf("XXH3\n"); - g_HashAlgorithm = HASH_XXH3; - } -} - -uint64_t ComputeHash(const void* data, size_t len) -{ - if (g_HashAlgorithm == HASH_NONE) { - InitHasher(); - } - - switch (g_HashAlgorithm) { - case HASH_XXH3: return XXH3_64bits(data, len); - case HASH_CRC32C: return crc32c_append(0, (uint8_t*)data, len); - } - - return 0; -} diff --git a/src/common/util/hasher.h b/src/common/util/hasher.h index a19a539ca2..9c8da5004e 100644 --- a/src/common/util/hasher.h +++ b/src/common/util/hasher.h @@ -27,8 +27,7 @@ #ifndef _HASHER_H #define _HASHER_H -#include - -uint64_t ComputeHash(const void* data, size_t len); +#include "xxhash.h" +#define ComputeHash XXH3_64bits #endif