From 53cd8464e15ccef6dbff6a2406d9cd5a6d6dd33e Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Mon, 4 Mar 2024 19:05:05 +0900 Subject: [PATCH] wip --- .ci/cross-check.sh | 11 ++- .github/workflows/cross-test.yml | 20 ++++ Makefile | 2 + sse2neon.h | 12 +-- tests/impl.cpp | 153 +++++++++++++++++++------------ 5 files changed, 129 insertions(+), 69 deletions(-) create mode 100644 .github/workflows/cross-test.yml diff --git a/.ci/cross-check.sh b/.ci/cross-check.sh index 50a0ad21..862fe2e5 100755 --- a/.ci/cross-check.sh +++ b/.ci/cross-check.sh @@ -13,12 +13,15 @@ fi set -e -wget -O gcc-arm-8.3-2019.02-x86_64-arm-linux-gnueabihf.tar.xz "https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.02/gcc-arm-8.3-2019.02-x86_64-arm-linux-gnueabihf.tar.xz?revision=ac15fd02-ba82-40dd-8b9a-8e5996988618&rev=ac15fd02ba8240dd8b9a8e5996988618&hash=347FC4F06948A4C49D8DFC6D847DC1AA090D3588" -tar Jxvf gcc-arm-8.3-2019.02-x86_64-arm-linux-gnueabihf.tar.xz +wget -O gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz "https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&rev=2e88a73fd2334f96b1f4d8b36e9bb0b9&hash=860E7F96815DDDC743E32589F0924011" +tar Jxvf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz make clean -export PATH=gcc-arm-8.3-2019.02-x86_64-arm-linux-gnueabihf/bin:$PATH -make CROSS_COMPILE=arm-linux-gnueabihf- check || exit 1 # ARMv8-A +export PATH=gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu/bin:$PATH +make CROSS_COMPILE=aarch64-linux-gnu- check || exit 1 # ARMv8-A + +# aarch64-linux-gnu-g++ -Wall -Wcast-qual -I. 
-march=armv8-a+fp+simd test.c -o test + # make clean # export PATH=gcc-arm-${GCC_REL}-x86_64-arm-none-linux-gnueabihf/bin:$PATH diff --git a/.github/workflows/cross-test.yml b/.github/workflows/cross-test.yml new file mode 100644 index 00000000..2bcac483 --- /dev/null +++ b/.github/workflows/cross-test.yml @@ -0,0 +1,20 @@ +name: CROSS TEST + +on: [push, pull_request] + +jobs: + host-x86: + runs-on: ubuntu-20.04 + strategy: + matrix: + arch: [x86_64] + cxx_compiler: [g++-10, clang++-11] + steps: + - name: checkout code + uses: actions/checkout@v4 + - name: build artifact + env: + CXX: ${{ matrix.cxx_compiler }} + run: | + sh .ci/cross-tool.sh + sh .ci/cross-check.sh diff --git a/Makefile b/Makefile index 999a3a7b..65ace9f2 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,8 @@ deps := $(OBJS:%.o=%.o.d) .SUFFIXES: .o .cpp .cpp.o: $(CXX) -o $@ $(CXXFLAGS) -c -MMD -MF $@.d $< + # $(CXX) -S $(CXXFLAGS) -g $< -o $@.s -MMD -MF $@.d -v + # cat tests/*.s EXEC = tests/main diff --git a/sse2neon.h b/sse2neon.h index 8e0044a9..8ec819fc 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -1774,7 +1774,7 @@ FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) #if defined(_MSC_VER) _WriteStatusReg(ARM64_FPCR, value); #else - __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ + __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ #endif } @@ -2385,7 +2385,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) #if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } @@ -2449,7 +2449,7 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) #if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } @@ -8644,9 +8644,9 @@ FORCE_INLINE 
__m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \ ((uint32_t) (b1) << 8) | (uint32_t) (b0)) -// muliplying 'x' by 2 in GF(2^8) + // multiplying 'x' by 2 in GF(2^8) #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) -// muliplying 'x' by 3 in GF(2^8) + // multiplying 'x' by 3 in GF(2^8) #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) #define SSE2NEON_AES_U0(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) @@ -9175,7 +9175,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) #if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } diff --git a/tests/impl.cpp b/tests/impl.cpp index 22170608..6a9e6f1d 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -2813,26 +2813,31 @@ result_t test_mm_sfence(const SSE2NEONTestImpl &impl, uint32_t iter) result_t test_mm_shuffle_pi16(const SSE2NEONTestImpl &impl, uint32_t iter) { +#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \ + (__GNUC__ == 9 && __GNUC_MINOR__ == 2) + // gcc-8.3 (and 9.2, per the guard) would cause an operand mismatch error here + return TEST_UNIMPL; +#else const int16_t *_a = (const int16_t *) impl.mTestIntPointer1; __m64 a; __m64 d; - -#define TEST_IMPL(IDX) \ - a = load_m64(_a); \ - d = _mm_shuffle_pi16(a, IDX); \ - \ - int16_t _d##IDX[4]; \ - _d##IDX[0] = _a[IDX & 0x3]; \ - _d##IDX[1] = _a[(IDX >> 2) & 0x3]; \ - _d##IDX[2] = _a[(IDX >> 4) & 0x3]; \ - _d##IDX[3] = _a[(IDX >> 6) & 0x3]; \ - if (VALIDATE_INT16_M64(d, _d##IDX) != TEST_SUCCESS) { \ - return TEST_FAIL; \ + int16_t _d[4]; +#define TEST_IMPL(IDX) \ + a = load_m64(_a); \ + d = _mm_shuffle_pi16(a, IDX); \ + \ + _d[0] = _a[IDX & 0x3]; \ + _d[1] = _a[(IDX >> 2) & 0x3]; \ + _d[2] = _a[(IDX >> 4) & 0x3]; \ + _d[3] = _a[(IDX >> 6) & 0x3]; \ + if 
(VALIDATE_INT16_M64(d, _d) != TEST_SUCCESS) { \ + return TEST_FAIL; \ } IMM_256_ITER #undef TEST_IMPL return TEST_SUCCESS; +#endif } // Note, NEON does not have a general purpose shuffled command like SSE. @@ -5088,6 +5093,11 @@ result_t test_mm_maskmoveu_si128(const SSE2NEONTestImpl &impl, uint32_t iter) result_t test_mm_max_epi16(const SSE2NEONTestImpl &impl, uint32_t iter) { +#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \ + (__GNUC__ == 9 && __GNUC_MINOR__ == 2) + // gcc-8.3 (and 9.2, per the guard) would cause an operand mismatch error here + return TEST_UNIMPL; +#else const int16_t *_a = (const int16_t *) impl.mTestIntPointer1; const int16_t *_b = (const int16_t *) impl.mTestIntPointer2; int16_t d[8]; @@ -5105,10 +5115,16 @@ result_t test_mm_max_epi16(const SSE2NEONTestImpl &impl, uint32_t iter) __m128i c = _mm_max_epi16(a, b); return VALIDATE_INT16_M128(c, d); +#endif } result_t test_mm_max_epu8(const SSE2NEONTestImpl &impl, uint32_t iter) { +#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \ + (__GNUC__ == 9 && __GNUC_MINOR__ == 2) + // gcc-8.3 (and 9.2, per the guard) would cause an operand mismatch error here + return TEST_UNIMPL; +#else const int8_t *_a = (const int8_t *) impl.mTestIntPointer1; const int8_t *_b = (const int8_t *) impl.mTestIntPointer2; uint8_t d[16]; @@ -5149,6 +5165,7 @@ result_t test_mm_max_epu8(const SSE2NEONTestImpl &impl, uint32_t iter) __m128i b = load_m128i(_b); __m128i c = _mm_max_epu8(a, b); return VALIDATE_INT8_M128(c, d); +#endif } result_t test_mm_max_pd(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -5842,23 +5859,29 @@ result_t test_mm_setzero_si128(const SSE2NEONTestImpl &impl, uint32_t iter) result_t test_mm_shuffle_epi32(const SSE2NEONTestImpl &impl, uint32_t iter) { +#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \ + (__GNUC__ == 9 && __GNUC_MINOR__ == 2) + // gcc-8.3 (and 9.2, per the guard) would cause an operand mismatch error here + return TEST_UNIMPL; +#else const int32_t *_a = impl.mTestIntPointer1; __m128i a, c; + int32_t _d[4]; -#define TEST_IMPL(IDX) \ - int32_t d##IDX[4]; \ - d##IDX[0] = 
_a[((IDX) &0x3)]; \ - d##IDX[1] = _a[((IDX >> 2) & 0x3)]; \ - d##IDX[2] = _a[((IDX >> 4) & 0x3)]; \ - d##IDX[3] = _a[((IDX >> 6) & 0x3)]; \ - \ - a = load_m128i(_a); \ - c = _mm_shuffle_epi32(a, IDX); \ - CHECK_RESULT(VALIDATE_INT32_M128(c, d##IDX)) +#define TEST_IMPL(IDX) \ + _d[0] = _a[((IDX) & 0x3)]; \ + _d[1] = _a[((IDX >> 2) & 0x3)]; \ + _d[2] = _a[((IDX >> 4) & 0x3)]; \ + _d[3] = _a[((IDX >> 6) & 0x3)]; \ + \ + a = load_m128i(_a); \ + c = _mm_shuffle_epi32(a, IDX); \ + CHECK_RESULT(VALIDATE_INT32_M128(c, _d)) IMM_256_ITER #undef TEST_IMPL return TEST_SUCCESS; +#endif } result_t test_mm_shuffle_pd(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -5883,60 +5906,72 @@ result_t test_mm_shuffle_pd(const SSE2NEONTestImpl &impl, uint32_t iter) result_t test_mm_shufflehi_epi16(const SSE2NEONTestImpl &impl, uint32_t iter) { +#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \ + (__GNUC__ == 9 && __GNUC_MINOR__ == 2) + // gcc-8.3 (and 9.2, per the guard) would cause an operand mismatch error here + return TEST_UNIMPL; +#else const int16_t *_a = (const int16_t *) impl.mTestIntPointer1; __m128i a, c; -#define TEST_IMPL(IDX) \ - int16_t d##IDX[8]; \ - d##IDX[0] = _a[0]; \ - d##IDX[1] = _a[1]; \ - d##IDX[2] = _a[2]; \ - d##IDX[3] = _a[3]; \ - d##IDX[4] = (int16_t) (((const int64_t *) _a)[1] >> ((IDX & 0x3) * 16)); \ - d##IDX[5] = \ - (int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 2) & 0x3) * 16)); \ - d##IDX[6] = \ - (int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 4) & 0x3) * 16)); \ - d##IDX[7] = \ - (int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 6) & 0x3) * 16)); \ - \ - a = load_m128i(_a); \ - c = _mm_shufflehi_epi16(a, IDX); \ - \ - CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + int16_t _d[8]; +#define TEST_IMPL(IDX) \ + _d[0] = _a[0]; \ + _d[1] = _a[1]; \ + _d[2] = _a[2]; \ + _d[3] = _a[3]; \ + _d[4] = (int16_t) (((const int64_t *) _a)[1] >> ((IDX & 0x3) * 16)); \ + _d[5] = \ + (int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 2) & 0x3) * 16)); \ + _d[6] = \ + (int16_t) (((const int64_t *) 
_a)[1] >> (((IDX >> 4) & 0x3) * 16)); \ + _d[7] = \ + (int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 6) & 0x3) * 16)); \ + \ + a = load_m128i(_a); \ + c = _mm_shufflehi_epi16(a, IDX); \ + \ + CHECK_RESULT(VALIDATE_INT16_M128(c, _d)) IMM_256_ITER #undef TEST_IMPL return TEST_SUCCESS; +#endif } result_t test_mm_shufflelo_epi16(const SSE2NEONTestImpl &impl, uint32_t iter) { +#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \ + (__GNUC__ == 9 && __GNUC_MINOR__ == 2) + // gcc-8.3 (and 9.2, per the guard) would cause an operand mismatch error here + return TEST_UNIMPL; +#else const int16_t *_a = (const int16_t *) impl.mTestIntPointer1; __m128i a, c; - -#define TEST_IMPL(IDX) \ - int16_t d##IDX[8]; \ - d##IDX[0] = (int16_t) (((const int64_t *) _a)[0] >> ((IDX & 0x3) * 16)); \ - d##IDX[1] = \ - (int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 2) & 0x3) * 16)); \ - d##IDX[2] = \ - (int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 4) & 0x3) * 16)); \ - d##IDX[3] = \ - (int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 6) & 0x3) * 16)); \ - d##IDX[4] = _a[4]; \ - d##IDX[5] = _a[5]; \ - d##IDX[6] = _a[6]; \ - d##IDX[7] = _a[7]; \ - \ - a = load_m128i(_a); \ - c = _mm_shufflelo_epi16(a, IDX); \ - \ - CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + int16_t _d[8]; + +#define TEST_IMPL(IDX) \ + _d[0] = (int16_t) (((const int64_t *) _a)[0] >> ((IDX & 0x3) * 16)); \ + _d[1] = \ + (int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 2) & 0x3) * 16)); \ + _d[2] = \ + (int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 4) & 0x3) * 16)); \ + _d[3] = \ + (int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 6) & 0x3) * 16)); \ + _d[4] = _a[4]; \ + _d[5] = _a[5]; \ + _d[6] = _a[6]; \ + _d[7] = _a[7]; \ + \ + a = load_m128i(_a); \ + c = _mm_shufflelo_epi16(a, IDX); \ + \ + CHECK_RESULT(VALIDATE_INT16_M128(c, _d)) IMM_256_ITER #undef TEST_IMPL return TEST_SUCCESS; +#endif } result_t test_mm_sll_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)