From 3801d5c8af8af79111738f86bdde0f81ff40cb56 Mon Sep 17 00:00:00 2001
From: Debayan Ghosh
Date: Fri, 4 May 2018 11:14:33 +0000
Subject: [PATCH] [Arm64] lj_new_str() crc32 optimization

---
 src/Makefile                      |   2 +-
 src/arm64/src/lj_str_hash_arm64.h | 271 ++++++++++++++++++++++++++++++
 src/lj_str.c                      |   5 +
 3 files changed, 277 insertions(+), 1 deletion(-)
 create mode 100644 src/arm64/src/lj_str_hash_arm64.h

diff --git a/src/Makefile b/src/Makefile
index 71ca028cd..1e4516527 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -50,7 +50,7 @@ CCOPT= -O2 -fomit-frame-pointer
 CCOPT_x86= -march=i686 -msse -msse2 -mfpmath=sse
 CCOPT_x64=
 CCOPT_arm=
-CCOPT_arm64=
+CCOPT_arm64= -march=armv8-a+crc
 CCOPT_ppc=
 CCOPT_mips=
 #
diff --git a/src/arm64/src/lj_str_hash_arm64.h b/src/arm64/src/lj_str_hash_arm64.h
new file mode 100644
index 000000000..8315f5820
--- /dev/null
+++ b/src/arm64/src/lj_str_hash_arm64.h
@@ -0,0 +1,271 @@
+/*
+ * This file defines string hash function using CRC32. It takes advantage of
+ * Arm64 hardware support (crc32 instruction) to speed up the CRC32
+ * computation. The hash functions try to compute CRC32 of length and up
+ * to 128 bytes of given string.
+ */
+
+#ifndef _LJ_STR_HASH_ARM64_H_
+#define _LJ_STR_HASH_ARM64_H_
+
+#if defined(__aarch64__) && defined(__GNUC__)
+/* NOTE(review): header names reconstructed from the symbols used; confirm. */
+#include <stdint.h>     /* uint32_t, uint64_t, int8_t */
+#include <stdlib.h>     /* random(), srandom(), rand(), RAND_MAX */
+#include <sys/types.h>  /* pid_t */
+#include <unistd.h>     /* getpid() */
+#include <time.h>       /* time() */
+#include <sys/auxv.h>   /* getauxval(), AT_HWCAP */
+#include <arm_acle.h>   /* __crc32cw(), __crc32cd() */
+
+#include "../../lj_def.h"
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif /* HWCAP for crc32 */
+
+#ifndef LJ_AINLINE
+#define LJ_AINLINE inline __attribute__((always_inline))
+#endif
+
+#ifdef __MINGW32__
+#define random() ((long) rand())
+#define srandom(seed) srand(seed)
+#endif
+
+extern uint32_t lj_str_original_hash(const char *str, size_t lenx);
+static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len);
+/* lj_str hash function determined at runtime */
+typedef uint32_t (*lj_str_hash_func)(const char *str, size_t lenx);
+lj_str_hash_func LJ_STR_HASH;
+
+static const uint64_t* cast_uint64p(const char* str)
+{
+  return (const uint64_t*)(void*)str;
+}
+
+static const uint32_t* cast_uint32p(const char* str)
+{
+  return (const uint32_t*)(void*)str;
+}
+
+static LJ_AINLINE uint32_t lj_str_hash_1_4(const char* str, uint32_t len)
+{
+  uint32_t v = str[0], h = 0;
+  v = (v << 8) | str[len >> 1];
+  v = (v << 8) | str[len - 1];
+  v = (v << 8) | len; /* first, middle and last byte plus len in one word */
+  return __crc32cw(h, v);
+}
+
+static LJ_AINLINE uint32_t lj_str_hash_4_16(const char* str, size_t len)
+{
+  uint64_t v1, v2, h = 0;
+
+  if (len >= 8) {
+    v1 = *cast_uint64p(str);
+    v2 = *cast_uint64p(str + len - 8); /* head and tail words may overlap */
+  } else {
+    v1 = *cast_uint32p(str);
+    v2 = *cast_uint32p(str + len - 4); /* head and tail words may overlap */
+  }
+
+  h = __crc32cw(h, len);
+  h = __crc32cd(h, v1);
+  h = __crc32cd(h, v2);
+
+  return h;
+}
+
+static LJ_AINLINE uint32_t lj_str_hash_16_128(const char* str, size_t len)
+{
+  uint64_t h1 = 0, h2 = 0;
+  uint32_t i;
+
+  h1 = __crc32cw(h1, len);
+
+  for (i = 0; i < len - 16; i += 16) {
+    h1 += __crc32cd(h1, *cast_uint64p(str + i));
+    h2 += __crc32cd(h2, *cast_uint64p(str + i + 8));
+  }
+
+  h1 = __crc32cd(h1, *cast_uint64p(str + len - 16)); /* final 16 bytes; */
+  h2 = __crc32cd(h2, *cast_uint64p(str + len - 8));  /* may overlap loop */
+
+  return __crc32cw(h1, h2);
+}
+
+/* **************************************************************************
+ *
+ * Following is code about hashing string with length >= 128: it samples
+ * 8-byte words at pre-computed random offsets instead of the whole string.
+ * **************************************************************************
+ */
+
+static uint32_t random_pos[32][2]; /* filled by arm64_init_random() */
+static const int8_t log2_tab[128] = { -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,
+  4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 };
+
+/* Return floor(log2(n)); n == 0 yields (uint32_t)-1 from log2_tab[0]. */
+static LJ_AINLINE uint32_t log2_floor(uint32_t n)
+{
+  if (n <= 127) {
+    return log2_tab[n];
+  }
+
+  if ((n >> 8) <= 127) {
+    return log2_tab[n >> 8] + 8;
+  }
+
+  if ((n >> 16) <= 127) {
+    return log2_tab[n >> 16] + 16;
+  }
+
+  if ((n >> 24) <= 127) {
+    return log2_tab[n >> 24] + 24;
+  }
+
+  return 31;
+}
+
+#define POW2_MASK(n) ((1L << (n)) - 1)
+/* Populate `random_pos`: rows 0..2 are zero, every other row i holds two
+ * random values masked to i+1 bits, i.e. in the range [0, 2**(i+1)).
+ */
+static void arm64_init_random(void)
+{
+  int i, seed, rml;
+
+  /* Calculate the ceil(log2(RAND_MAX)) */
+  rml = log2_floor(RAND_MAX);
+  if (RAND_MAX & (RAND_MAX - 1)) {
+    rml += 1;
+  }
+
+  /* Init seed from pid and time (uses crc32 instructions) */
+  seed = 0;
+  seed = __crc32cw(seed, getpid());
+  seed = __crc32cw(seed, time(NULL));
+  srandom(seed);
+
+  /* Now start to populate the random_pos[][]. */
+  for (i = 0; i < 3; i++) {
+    /* No need to provide random value for chunk smaller than 8 bytes */
+    random_pos[i][0] = random_pos[i][1] = 0;
+  }
+
+  for (; i < rml; i++) {
+    random_pos[i][0] = random() & POW2_MASK(i+1);
+    random_pos[i][1] = random() & POW2_MASK(i+1);
+  }
+
+  for (; i < 31; i++) { /* runs only if rml < 31 */
+    int j;
+    for (j = 0; j < 2; j++) {
+      uint32_t v, scale;
+      scale = random_pos[i - rml][0];
+      if (scale == 0) {
+        scale = 1;
+      }
+      v = (random() * scale) & POW2_MASK(i+1);
+      random_pos[i][j] = v;
+    }
+  }
+}
+#undef POW2_MASK
+
+void __attribute__((constructor)) arm64_init_constructor(void)
+{
+  /* AT_HWCAP reports at runtime whether the CPU implements CRC32. */
+  unsigned long hwcap;
+  hwcap = getauxval(AT_HWCAP);
+  if (hwcap & HWCAP_CRC32) {
+    /* Seeding executes crc32 instructions, so do it only when supported. */
+    arm64_init_random();
+    LJ_STR_HASH = lj_str_hash;
+  }
+  else {
+    /* No CRC32: use the original hash; random_pos[] is never read here. */
+    LJ_STR_HASH = lj_str_original_hash;
+  }
+}
+
+/* Return the pre-computed random offset random_pos[chunk_sz_order][idx & 1],
+ * a value in [0, 2**(chunk_sz_order+1)). It is "unsafe" in the sense that it
+ * may be greater than chunk-size; it is up to the caller to make sure
+ * "chunk-base + return-value-of-this-func" has valid virtual address.
+ */
+static LJ_AINLINE uint32_t get_random_pos_unsafe(uint32_t chunk_sz_order,
+    uint32_t idx)
+{
+  uint32_t pos = random_pos[chunk_sz_order][idx & 1];
+  return pos;
+}
+
+static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str,
+    uint32_t len)
+{
+  uint32_t chunk_num, chunk_sz, chunk_sz_log2, i, pos1, pos2;
+  uint64_t h1, h2, v; /* v is 64-bit so 8-byte samples are not truncated */
+  const char* chunk_ptr;
+
+  chunk_num = 16;
+  chunk_sz = len / chunk_num;
+  chunk_sz_log2 = log2_floor(chunk_sz);
+
+  pos1 = get_random_pos_unsafe(chunk_sz_log2, 0);
+  pos2 = get_random_pos_unsafe(chunk_sz_log2, 1);
+
+  h1 = 0;
+  h1 = __crc32cw(h1, len);
+  h2 = 0;
+
+  /* 7 rounds: sample this chunk and the next, then step one chunk */
+  for (i = 0, chunk_ptr = str; i < (chunk_num / 2 - 1);
+       chunk_ptr += chunk_sz, i++) {
+
+    v = *cast_uint64p(chunk_ptr + pos1);
+    h1 = __crc32cd(h1, v);
+
+    v = *cast_uint64p(chunk_ptr + chunk_sz + pos2);
+    h2 = __crc32cd(h2, v);
+  }
+
+  /* final pair of samples; the second is offset back from the chunk end */
+  v = *cast_uint64p(chunk_ptr + pos1);
+  h1 = __crc32cd(h1, v);
+
+  v = *cast_uint64p(chunk_ptr + chunk_sz - 8 - pos2);
+  h2 = __crc32cd(h2, v);
+
+  /* always mix in the first and the last 8 bytes */
+  h1 = __crc32cd(h1, *cast_uint64p(str));
+  h2 = __crc32cd(h2, *cast_uint64p(str + len - 8));
+
+  h1 = __crc32cw(h1, h2);
+  return h1;
+}
+
+/* Dispatch on length: 1..3, 4..15, 16..127 and >= 128 bytes each have their
+ * own routine. NOTE: the "len" should not be zero */
+static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len)
+{
+  if (len < 128) {
+    if (len >= 16) {
+      return lj_str_hash_16_128(str, len);
+    }
+
+    if (len >= 4) { /* len < 16 already holds here */
+      return lj_str_hash_4_16(str, len);
+    }
+
+    return lj_str_hash_1_4(str, len);
+  }
+  return lj_str_hash_128_above(str, len);
+}
+
+#endif // defined(__aarch64__)
+#endif // _LJ_STR_HASH_ARM64_H_
diff --git a/src/lj_str.c b/src/lj_str.c
index b9469ca00..747a33f0b 100644
--- a/src/lj_str.c
+++ b/src/lj_str.c
@@ -163,6 +163,10 @@ lj_str_indep_hash(GCstr *str) {
   return lj_str_original_hash(strdata(str), str->len);
 }
 
+#if defined(__aarch64__)
+/* AArch64 CRC32 support determined at runtime */
+#include "arm64/src/lj_str_hash_arm64.h"
+#else /* x64 */
 #include "x64/src/lj_str_hash_x64.h"
 
 #if defined(LJ_ARCH_STR_HASH)
@@ -170,6 +174,7 @@ lj_str_indep_hash(GCstr *str) {
 #else
 #define LJ_STR_HASH lj_str_original_hash
 #endif
+#endif
 
 /* Intern a string and return string object. */
 GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)