Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Mar 4, 2024
1 parent e16335b commit 53cd846
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 69 deletions.
11 changes: 7 additions & 4 deletions .ci/cross-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@ fi

set -e

wget -O gcc-arm-8.3-2019.02-x86_64-arm-linux-gnueabihf.tar.xz "https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.02/gcc-arm-8.3-2019.02-x86_64-arm-linux-gnueabihf.tar.xz?revision=ac15fd02-ba82-40dd-8b9a-8e5996988618&rev=ac15fd02ba8240dd8b9a8e5996988618&hash=347FC4F06948A4C49D8DFC6D847DC1AA090D3588"
tar Jxvf gcc-arm-8.3-2019.02-x86_64-arm-linux-gnueabihf.tar.xz
wget -O gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz "https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&rev=2e88a73fd2334f96b1f4d8b36e9bb0b9&hash=860E7F96815DDDC743E32589F0924011"
tar Jxvf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz

make clean
export PATH=gcc-arm-8.3-2019.02-x86_64-arm-linux-gnueabihf/bin:$PATH
make CROSS_COMPILE=arm-linux-gnueabihf- check || exit 1 # ARMv8-A
export PATH=gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu/bin:$PATH
make CROSS_COMPILE=aarch64-linux-gnu- check || exit 1 # ARMv8-A

# aarch64-linux-gnu-g++ -Wall -Wcast-qual -I. -march=armv8-a+fp+simd test.c -o test


# make clean
# export PATH=gcc-arm-${GCC_REL}-x86_64-arm-none-linux-gnueabihf/bin:$PATH
Expand Down
20 changes: 20 additions & 0 deletions .github/workflows/cross-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Workflow that checks out the repository and runs the .ci/cross-tool.sh and
# .ci/cross-check.sh scripts.
name: CROSS TEST

# Runs on every push and on every pull request.
on: [push, pull_request]

jobs:
host-x86:
runs-on: ubuntu-20.04
strategy:
# One job is generated per matrix combination (1 arch x 2 compilers).
# NOTE(review): matrix.arch is not referenced by any step in this file.
matrix:
arch: [x86_64]
cxx_compiler: [g++-10, clang++-11]
steps:
- name: checkout code
uses: actions/checkout@v4
- name: build artifact
env:
# Exposes the matrix compiler to the scripts below as CXX.
# NOTE(review): cross-check.sh invokes make with CROSS_COMPILE=...; confirm
# whether the Makefile still honors this CXX value in that case.
CXX: ${{ matrix.cxx_compiler }}
# The two scripts run in order within a single shell step.
run: |
sh .ci/cross-tool.sh
sh .ci/cross-check.sh
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ deps := $(OBJS:%.o=%.o.d)
.SUFFIXES: .o .cpp
.cpp.o:
$(CXX) -o $@ $(CXXFLAGS) -c -MMD -MF $@.d $<
# $(CXX) -S $(CXXFLAGS) -g $< -o [email protected] -MMD -MF [email protected] -v
# cat tests/*.s

EXEC = tests/main

Expand Down
12 changes: 6 additions & 6 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -1774,7 +1774,7 @@ FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
#if defined(_MSC_VER)
_WriteStatusReg(ARM64_FPCR, value);
#else
__asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
__asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
#endif
}

Expand Down Expand Up @@ -2385,7 +2385,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

Expand Down Expand Up @@ -2449,7 +2449,7 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

Expand Down Expand Up @@ -8644,9 +8644,9 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
(((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
Expand Down Expand Up @@ -9175,7 +9175,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

Expand Down
153 changes: 94 additions & 59 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2813,26 +2813,31 @@ result_t test_mm_sfence(const SSE2NEONTestImpl &impl, uint32_t iter)

result_t test_mm_shuffle_pi16(const SSE2NEONTestImpl &impl, uint32_t iter)
{
#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \
(__GNUC__ == 9 && __GNUC_MINOR__ == 2)
// gcc-8.3 and gcc-9.2 would cause an operand mismatch error here
return TEST_UNIMPL;
#else
const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
__m64 a;
__m64 d;

#define TEST_IMPL(IDX) \
a = load_m64(_a); \
d = _mm_shuffle_pi16(a, IDX); \
\
int16_t _d##IDX[4]; \
_d##IDX[0] = _a[IDX & 0x3]; \
_d##IDX[1] = _a[(IDX >> 2) & 0x3]; \
_d##IDX[2] = _a[(IDX >> 4) & 0x3]; \
_d##IDX[3] = _a[(IDX >> 6) & 0x3]; \
if (VALIDATE_INT16_M64(d, _d##IDX) != TEST_SUCCESS) { \
return TEST_FAIL; \
int16_t _d[4];
#define TEST_IMPL(IDX) \
a = load_m64(_a); \
d = _mm_shuffle_pi16(a, IDX); \
\
_d[0] = _a[IDX & 0x3]; \
_d[1] = _a[(IDX >> 2) & 0x3]; \
_d[2] = _a[(IDX >> 4) & 0x3]; \
_d[3] = _a[(IDX >> 6) & 0x3]; \
if (VALIDATE_INT16_M64(d, _d) != TEST_SUCCESS) { \
return TEST_FAIL; \
}

IMM_256_ITER
#undef TEST_IMPL
return TEST_SUCCESS;
#endif
}

// Note: NEON does not have a general-purpose shuffle instruction like SSE.
Expand Down Expand Up @@ -5088,6 +5093,11 @@ result_t test_mm_maskmoveu_si128(const SSE2NEONTestImpl &impl, uint32_t iter)

result_t test_mm_max_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
{
#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \
(__GNUC__ == 9 && __GNUC_MINOR__ == 2)
// gcc-8.3 and gcc-9.2 would cause an operand mismatch error here
return TEST_UNIMPL;
#else
const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
const int16_t *_b = (const int16_t *) impl.mTestIntPointer2;
int16_t d[8];
Expand All @@ -5105,10 +5115,16 @@ result_t test_mm_max_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)

__m128i c = _mm_max_epi16(a, b);
return VALIDATE_INT16_M128(c, d);
#endif
}

result_t test_mm_max_epu8(const SSE2NEONTestImpl &impl, uint32_t iter)
{
#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \
(__GNUC__ == 9 && __GNUC_MINOR__ == 2)
// gcc-8.3 and gcc-9.2 would cause an operand mismatch error here
return TEST_UNIMPL;
#else
const int8_t *_a = (const int8_t *) impl.mTestIntPointer1;
const int8_t *_b = (const int8_t *) impl.mTestIntPointer2;
uint8_t d[16];
Expand Down Expand Up @@ -5149,6 +5165,7 @@ result_t test_mm_max_epu8(const SSE2NEONTestImpl &impl, uint32_t iter)
__m128i b = load_m128i(_b);
__m128i c = _mm_max_epu8(a, b);
return VALIDATE_INT8_M128(c, d);
#endif
}

result_t test_mm_max_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
Expand Down Expand Up @@ -5842,23 +5859,29 @@ result_t test_mm_setzero_si128(const SSE2NEONTestImpl &impl, uint32_t iter)

result_t test_mm_shuffle_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
{
#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \
(__GNUC__ == 9 && __GNUC_MINOR__ == 2)
// gcc-8.3 and gcc-9.2 would cause an operand mismatch error here
return TEST_UNIMPL;
#else
const int32_t *_a = impl.mTestIntPointer1;
__m128i a, c;
int32_t _d[4];

#define TEST_IMPL(IDX) \
int32_t d##IDX[4]; \
d##IDX[0] = _a[((IDX) &0x3)]; \
d##IDX[1] = _a[((IDX >> 2) & 0x3)]; \
d##IDX[2] = _a[((IDX >> 4) & 0x3)]; \
d##IDX[3] = _a[((IDX >> 6) & 0x3)]; \
\
a = load_m128i(_a); \
c = _mm_shuffle_epi32(a, IDX); \
CHECK_RESULT(VALIDATE_INT32_M128(c, d##IDX))
#define TEST_IMPL(IDX) \
_d[0] = _a[((IDX) & 0x3)]; \
_d[1] = _a[((IDX >> 2) & 0x3)]; \
_d[2] = _a[((IDX >> 4) & 0x3)]; \
_d[3] = _a[((IDX >> 6) & 0x3)]; \
\
a = load_m128i(_a); \
c = _mm_shuffle_epi32(a, IDX); \
CHECK_RESULT(VALIDATE_INT32_M128(c, _d))

IMM_256_ITER
#undef TEST_IMPL
return TEST_SUCCESS;
#endif
}

result_t test_mm_shuffle_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
Expand All @@ -5883,60 +5906,72 @@ result_t test_mm_shuffle_pd(const SSE2NEONTestImpl &impl, uint32_t iter)

result_t test_mm_shufflehi_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
{
#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \
(__GNUC__ == 9 && __GNUC_MINOR__ == 2)
// gcc-8.3 and gcc-9.2 would cause an operand mismatch error here
return TEST_UNIMPL;
#else
const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
__m128i a, c;

#define TEST_IMPL(IDX) \
int16_t d##IDX[8]; \
d##IDX[0] = _a[0]; \
d##IDX[1] = _a[1]; \
d##IDX[2] = _a[2]; \
d##IDX[3] = _a[3]; \
d##IDX[4] = (int16_t) (((const int64_t *) _a)[1] >> ((IDX & 0x3) * 16)); \
d##IDX[5] = \
(int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 2) & 0x3) * 16)); \
d##IDX[6] = \
(int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 4) & 0x3) * 16)); \
d##IDX[7] = \
(int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 6) & 0x3) * 16)); \
\
a = load_m128i(_a); \
c = _mm_shufflehi_epi16(a, IDX); \
\
CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX))
int16_t _d[8];
#define TEST_IMPL(IDX) \
_d[0] = _a[0]; \
_d[1] = _a[1]; \
_d[2] = _a[2]; \
_d[3] = _a[3]; \
_d[4] = (int16_t) (((const int64_t *) _a)[1] >> ((IDX & 0x3) * 16)); \
_d[5] = \
(int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 2) & 0x3) * 16)); \
_d[6] = \
(int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 4) & 0x3) * 16)); \
_d[7] = \
(int16_t) (((const int64_t *) _a)[1] >> (((IDX >> 6) & 0x3) * 16)); \
\
a = load_m128i(_a); \
c = _mm_shufflehi_epi16(a, IDX); \
\
CHECK_RESULT(VALIDATE_INT16_M128(c, _d))

IMM_256_ITER
#undef TEST_IMPL
return TEST_SUCCESS;
#endif
}

result_t test_mm_shufflelo_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
{
#if (__GNUC__ == 8 && __GNUC_MINOR__ == 3) || \
(__GNUC__ == 9 && __GNUC_MINOR__ == 2)
// gcc-8.3 and gcc-9.2 would cause an operand mismatch error here
return TEST_UNIMPL;
#else
const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
__m128i a, c;

#define TEST_IMPL(IDX) \
int16_t d##IDX[8]; \
d##IDX[0] = (int16_t) (((const int64_t *) _a)[0] >> ((IDX & 0x3) * 16)); \
d##IDX[1] = \
(int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 2) & 0x3) * 16)); \
d##IDX[2] = \
(int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 4) & 0x3) * 16)); \
d##IDX[3] = \
(int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 6) & 0x3) * 16)); \
d##IDX[4] = _a[4]; \
d##IDX[5] = _a[5]; \
d##IDX[6] = _a[6]; \
d##IDX[7] = _a[7]; \
\
a = load_m128i(_a); \
c = _mm_shufflelo_epi16(a, IDX); \
\
CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX))
int16_t _d[8];

#define TEST_IMPL(IDX) \
_d[0] = (int16_t) (((const int64_t *) _a)[0] >> ((IDX & 0x3) * 16)); \
_d[1] = \
(int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 2) & 0x3) * 16)); \
_d[2] = \
(int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 4) & 0x3) * 16)); \
_d[3] = \
(int16_t) (((const int64_t *) _a)[0] >> (((IDX >> 6) & 0x3) * 16)); \
_d[4] = _a[4]; \
_d[5] = _a[5]; \
_d[6] = _a[6]; \
_d[7] = _a[7]; \
\
a = load_m128i(_a); \
c = _mm_shufflelo_epi16(a, IDX); \
\
CHECK_RESULT(VALIDATE_INT16_M128(c, _d))

IMM_256_ITER
#undef TEST_IMPL
return TEST_SUCCESS;
#endif
}

result_t test_mm_sll_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
Expand Down

0 comments on commit 53cd846

Please sign in to comment.