From 4ae495c4d047d88184b7d9a7dc3dd2220f05bc15 Mon Sep 17 00:00:00 2001 From: clamchowder Date: Tue, 23 Jan 2024 05:20:21 +0000 Subject: [PATCH] avx512 mt irate --- .../InstructionRateFunctions.asm | 306 +++++++++++++++--- mt_instructionrate/x86_mt_instructionrate.c | 41 +++ 2 files changed, 299 insertions(+), 48 deletions(-) diff --git a/mt_instructionrate/InstructionRateFunctions.asm b/mt_instructionrate/InstructionRateFunctions.asm index c27198b..17e6699 100644 --- a/mt_instructionrate/InstructionRateFunctions.asm +++ b/mt_instructionrate/InstructionRateFunctions.asm @@ -13,9 +13,11 @@ global avx2_int64_mul_test global sse_fp32_add_test global sse_fp32_mul_test global sse_fp32_muladd_test +global sse_fp32_rsqrt_test global avx_fp32_add_test global avx_fp32_mul_test global avx_fp32_muladd_test +global avx_fp32_rsqrt_test global fp32_fma_test global fp64_fma_test @@ -26,13 +28,23 @@ global avx_fp64_add_test global avx_fp64_mul_test global avx_fp64_muladd_test +global avx512_int32_add_test +global avx512_int32_mul_test +global avx512_int64_add_test +global avx512_int64_mul_test +global avx512_fp32_rsqrt_test +global avx512_fp32_add_test +global avx512_fp32_fma_test +global avx512_fp64_add_test +global avx512_fp64_fma_test + sse_int32_add_test: - movups xmm0, [rdx] - movups xmm1, [rdx + 16] - movups xmm2, [rdx + 32] - movups xmm3, [rdx + 48] - movups xmm4, [rdx + 64] - movups xmm5, [rdx + 72] + movdqu xmm0, [rdx] + movdqu xmm1, [rdx + 16] + movdqu xmm2, [rdx + 32] + movdqu xmm3, [rdx + 48] + movdqu xmm4, [rdx + 64] + movdqu xmm5, [rdx + 80] sse_int32_add_test_loop: paddd xmm0, xmm0 paddd xmm1, xmm1 @@ -46,12 +58,12 @@ sse_int32_add_test_loop: ret sse_int64_add_test: - movups xmm0, [rdx] - movups xmm1, [rdx + 16] - movups xmm2, [rdx + 32] - movups xmm3, [rdx + 48] - movups xmm4, [rdx + 64] - movups xmm5, [rdx + 72] + movdqu xmm0, [rdx] + movdqu xmm1, [rdx + 16] + movdqu xmm2, [rdx + 32] + movdqu xmm3, [rdx + 48] + movdqu xmm4, [rdx + 64] + movdqu xmm5, [rdx + 80] 
sse_int64_add_test_loop: paddq xmm0, xmm0 paddq xmm1, xmm1 @@ -64,12 +76,12 @@ sse_int64_add_test_loop: ret sse_int32_mul_test: - movups xmm0, [rdx] - movups xmm1, [rdx + 16] - movups xmm2, [rdx + 32] - movups xmm3, [rdx + 48] - movups xmm4, [rdx + 64] - movups xmm5, [rdx + 72] + movdqu xmm0, [rdx] + movdqu xmm1, [rdx + 16] + movdqu xmm2, [rdx + 32] + movdqu xmm3, [rdx + 48] + movdqu xmm4, [rdx + 64] + movdqu xmm5, [rdx + 80] sse_int32_mul_test_loop: pmulld xmm0, xmm0 pmulld xmm1, xmm1 @@ -82,12 +94,12 @@ sse_int32_mul_test_loop: ret sse_int64_mul_test: - movups xmm0, [rdx] - movups xmm1, [rdx + 16] - movups xmm2, [rdx + 32] - movups xmm3, [rdx + 48] - movups xmm4, [rdx + 64] - movups xmm5, [rdx + 72] + movdqu xmm0, [rdx] + movdqu xmm1, [rdx + 16] + movdqu xmm2, [rdx + 32] + movdqu xmm3, [rdx + 48] + movdqu xmm4, [rdx + 64] + movdqu xmm5, [rdx + 80] sse_int64_mul_test_loop: pmuludq xmm0, xmm0 pmuludq xmm1, xmm1 @@ -100,12 +112,12 @@ sse_int64_mul_test_loop: ret avx2_int32_add_test: - vmovups ymm0, [rdx] - vmovups ymm1, [rdx + 32] - vmovups ymm2, [rdx + 64] - vmovups ymm3, [rdx + 96] - vmovups ymm4, [rdx + 128] - vmovups ymm5, [rdx + 160] + vmovdqu ymm0, [rdx] + vmovdqu ymm1, [rdx + 32] + vmovdqu ymm2, [rdx + 64] + vmovdqu ymm3, [rdx + 96] + vmovdqu ymm4, [rdx + 128] + vmovdqu ymm5, [rdx + 160] avx2_int32_add_test_loop: vpaddd ymm0, ymm0, ymm0 vpaddd ymm1, ymm1, ymm1 @@ -119,12 +131,12 @@ avx2_int32_add_test_loop: ret avx2_int32_mul_test: - vmovups ymm0, [rdx] - vmovups ymm1, [rdx + 32] - vmovups ymm2, [rdx + 64] - vmovups ymm3, [rdx + 96] - vmovups ymm4, [rdx + 128] - vmovups ymm5, [rdx + 160] + vmovdqu ymm0, [rdx] + vmovdqu ymm1, [rdx + 32] + vmovdqu ymm2, [rdx + 64] + vmovdqu ymm3, [rdx + 96] + vmovdqu ymm4, [rdx + 128] + vmovdqu ymm5, [rdx + 160] avx2_int32_mul_test_loop: vpmulld ymm0, ymm0, ymm0 vpmulld ymm1, ymm1, ymm1 @@ -137,12 +149,12 @@ avx2_int32_mul_test_loop: ret avx2_int64_add_test: - vmovups ymm0, [rdx] - vmovups ymm1, [rdx + 32] - vmovups ymm2, [rdx 
+ 64] - vmovups ymm3, [rdx + 96] - vmovups ymm4, [rdx + 128] - vmovups ymm5, [rdx + 160] + vmovdqu ymm0, [rdx] + vmovdqu ymm1, [rdx + 32] + vmovdqu ymm2, [rdx + 64] + vmovdqu ymm3, [rdx + 96] + vmovdqu ymm4, [rdx + 128] + vmovdqu ymm5, [rdx + 160] avx2_int64_add_test_loop: vpaddq ymm0, ymm0, ymm0 vpaddq ymm1, ymm1, ymm1 @@ -155,12 +167,12 @@ avx2_int64_add_test_loop: ret avx2_int64_mul_test: - vmovups ymm0, [rdx] - vmovups ymm1, [rdx + 32] - vmovups ymm2, [rdx + 64] - vmovups ymm3, [rdx + 96] - vmovups ymm4, [rdx + 128] - vmovups ymm5, [rdx + 160] + vmovdqu ymm0, [rdx] + vmovdqu ymm1, [rdx + 32] + vmovdqu ymm2, [rdx + 64] + vmovdqu ymm3, [rdx + 96] + vmovdqu ymm4, [rdx + 128] + vmovdqu ymm5, [rdx + 160] avx2_int64_mul_test_loop: vpmuldq ymm0, ymm0, ymm0 vpmuldq ymm1, ymm1, ymm1 @@ -268,6 +280,42 @@ sse_fp32_muladd_test_loop: jg sse_fp32_muladd_test_loop ret +sse_fp32_rsqrt_test: + movups xmm0, [rdx] + movups xmm1, [rdx + 16] + movups xmm2, [rdx + 32] + movups xmm3, [rdx + 48] + movups xmm4, [rdx + 64] + movups xmm5, [rdx + 80] +sse_fp32_rsqrt_test_loop: + rsqrtps xmm0, xmm0 + rsqrtps xmm1, xmm1 + rsqrtps xmm2, xmm2 + rsqrtps xmm3, xmm3 + rsqrtps xmm4, xmm4 + rsqrtps xmm5, xmm5 + sub rcx, 24 + jg sse_fp32_rsqrt_test_loop + ret + +avx_fp32_rsqrt_test: + vmovups ymm0, [rdx] + vmovups ymm1, [rdx + 32] + vmovups ymm2, [rdx + 64] + vmovups ymm3, [rdx + 96] + vmovups ymm4, [rdx + 128] + vmovups ymm5, [rdx + 160] +avx_fp32_rsqrt_test_loop: + vrsqrtps ymm0, ymm0 + vrsqrtps ymm1, ymm1 + vrsqrtps ymm2, ymm2 + vrsqrtps ymm3, ymm3 + vrsqrtps ymm4, ymm4 + vrsqrtps ymm5, ymm5 + sub rcx, 48 + jg avx_fp32_rsqrt_test_loop + ret + sse_fp64_muladd_test: movups xmm0, [rdx] movups xmm1, [rdx + 16] @@ -450,4 +498,166 @@ fp64_fma_test_loop: vfmadd132pd ymm5, ymm5, ymm6 sub rcx, 24 jg fp64_fma_test_loop + ret + +avx512_int32_add_test: + vmovdqu32 zmm0, [rdx] + vmovdqu32 zmm1, [rdx + 64] + vmovdqu32 zmm2, [rdx + 128] + vmovdqu32 zmm3, [rdx + 192] + vmovdqu32 zmm4, [rdx + 256] + vmovdqu32 
zmm5, [rdx + 320] +avx512_int32_add_test_loop: + vpaddd zmm0, zmm0, zmm0 + vpaddd zmm1, zmm1, zmm1 + vpaddd zmm2, zmm2, zmm2 + vpaddd zmm3, zmm3, zmm3 + vpaddd zmm4, zmm4, zmm4 + vpaddd zmm5, zmm5, zmm5 + sub rcx, 96 + jg avx512_int32_add_test_loop + ret + +avx512_int32_mul_test: + vmovdqu32 zmm0, [rdx] + vmovdqu32 zmm1, [rdx + 64] + vmovdqu32 zmm2, [rdx + 128] + vmovdqu32 zmm3, [rdx + 192] + vmovdqu32 zmm4, [rdx + 256] + vmovdqu32 zmm5, [rdx + 320] +avx512_int32_mul_test_loop: + vpmulld zmm0, zmm0, zmm0 + vpmulld zmm1, zmm1, zmm1 + vpmulld zmm2, zmm2, zmm2 + vpmulld zmm3, zmm3, zmm3 + vpmulld zmm4, zmm4, zmm4 + vpmulld zmm5, zmm5, zmm5 + sub rcx, 96 + jg avx512_int32_mul_test_loop + ret + +avx512_int64_add_test: + vmovdqu64 zmm0, [rdx] + vmovdqu64 zmm1, [rdx + 64] + vmovdqu64 zmm2, [rdx + 128] + vmovdqu64 zmm3, [rdx + 192] + vmovdqu64 zmm4, [rdx + 256] + vmovdqu64 zmm5, [rdx + 320] +avx512_int64_add_test_loop: + vpaddq zmm0, zmm0, zmm0 + vpaddq zmm1, zmm1, zmm1 + vpaddq zmm2, zmm2, zmm2 + vpaddq zmm3, zmm3, zmm3 + vpaddq zmm4, zmm4, zmm4 + vpaddq zmm5, zmm5, zmm5 + sub rcx, 48 + jg avx512_int64_add_test_loop + ret + +avx512_int64_mul_test: + vmovdqu64 zmm0, [rdx] + vmovdqu64 zmm1, [rdx + 64] + vmovdqu64 zmm2, [rdx + 128] + vmovdqu64 zmm3, [rdx + 192] + vmovdqu64 zmm4, [rdx + 256] + vmovdqu64 zmm5, [rdx + 320] +avx512_int64_mul_test_loop: + vpmuldq zmm0, zmm0, zmm0 + vpmuldq zmm1, zmm1, zmm1 + vpmuldq zmm2, zmm2, zmm2 + vpmuldq zmm3, zmm3, zmm3 + vpmuldq zmm4, zmm4, zmm4 + vpmuldq zmm5, zmm5, zmm5 + sub rcx, 48 + jg avx512_int64_mul_test_loop + ret + +avx512_fp32_rsqrt_test: + vmovups zmm0, [rdx] + vmovups zmm1, [rdx + 64] + vmovups zmm2, [rdx + 128] + vmovups zmm3, [rdx + 192] + vmovups zmm4, [rdx + 256] + vmovups zmm5, [rdx + 320] +avx512_fp32_rsqrt_test_loop: + vrsqrt14ps zmm0, zmm0 + vrsqrt14ps zmm1, zmm1 + vrsqrt14ps zmm2, zmm2 + vrsqrt14ps zmm3, zmm3 + vrsqrt14ps zmm4, zmm4 + vrsqrt14ps zmm5, zmm5 + sub rcx, 96 + jg avx512_fp32_rsqrt_test_loop + ret + 
+avx512_fp32_add_test: + vmovups zmm0, [rdx] + vmovups zmm1, [rdx + 64] + vmovups zmm2, [rdx + 128] + vmovups zmm3, [rdx + 192] + vmovups zmm4, [rdx + 256] + vmovups zmm5, [rdx + 320] +avx512_fp32_add_test_loop: + vaddps zmm0, zmm0, zmm0 + vaddps zmm1, zmm1, zmm1 + vaddps zmm2, zmm2, zmm2 + vaddps zmm3, zmm3, zmm3 + vaddps zmm4, zmm4, zmm4 + vaddps zmm5, zmm5, zmm5 + sub rcx, 96 + jg avx512_fp32_add_test_loop + ret + +avx512_fp32_fma_test: + vmovups zmm0, [rdx] + vmovups zmm1, [rdx + 64] + vmovups zmm2, [rdx + 128] + vmovups zmm3, [rdx + 192] + vmovups zmm4, [rdx + 256] + vmovups zmm5, [rdx + 320] +avx512_fp32_fma_test_loop: + vfmadd132ps zmm0, zmm0, zmm0 + vfmadd132ps zmm1, zmm1, zmm1 + vfmadd132ps zmm2, zmm2, zmm2 + vfmadd132ps zmm3, zmm3, zmm3 + vfmadd132ps zmm4, zmm4, zmm4 + vfmadd132ps zmm5, zmm5, zmm5 + sub rcx, 96 + jg avx512_fp32_fma_test_loop + ret + +avx512_fp64_add_test: + vmovups zmm0, [rdx] + vmovups zmm1, [rdx + 64] + vmovups zmm2, [rdx + 128] + vmovups zmm3, [rdx + 192] + vmovups zmm4, [rdx + 256] + vmovups zmm5, [rdx + 320] +avx512_fp64_add_test_loop: + vaddpd zmm0, zmm0, zmm0 + vaddpd zmm1, zmm1, zmm1 + vaddpd zmm2, zmm2, zmm2 + vaddpd zmm3, zmm3, zmm3 + vaddpd zmm4, zmm4, zmm4 + vaddpd zmm5, zmm5, zmm5 + sub rcx, 48 + jg avx512_fp64_add_test_loop + ret + +avx512_fp64_fma_test: + vmovups zmm0, [rdx] + vmovups zmm1, [rdx + 64] + vmovups zmm2, [rdx + 128] + vmovups zmm3, [rdx + 192] + vmovups zmm4, [rdx + 256] + vmovups zmm5, [rdx + 320] +avx512_fp64_fma_test_loop: + vfmadd132pd zmm0, zmm0, zmm0 + vfmadd132pd zmm1, zmm1, zmm1 + vfmadd132pd zmm2, zmm2, zmm2 + vfmadd132pd zmm3, zmm3, zmm3 + vfmadd132pd zmm4, zmm4, zmm4 + vfmadd132pd zmm5, zmm5, zmm5 + sub rcx, 48 + jg avx512_fp64_fma_test_loop ret \ No newline at end of file diff --git a/mt_instructionrate/x86_mt_instructionrate.c b/mt_instructionrate/x86_mt_instructionrate.c index ec94540..81ac133 100644 --- a/mt_instructionrate/x86_mt_instructionrate.c +++ 
b/mt_instructionrate/x86_mt_instructionrate.c @@ -21,6 +21,19 @@ extern uint64_t avx_fp64_muladd_test(uint64_t iterations, void* data) SMALLKITTE extern uint64_t fp32_fma_test(uint64_t iterations, void* data) SMALLKITTEN; extern uint64_t fp64_fma_test(uint64_t iterations, void* data) SMALLKITTEN; +extern uint64_t avx_fp32_rsqrt_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t sse_fp32_rsqrt_test(uint64_t iterations, void *data) SMALLKITTEN; + +extern uint64_t avx512_int32_add_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t avx512_int32_mul_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t avx512_int64_add_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t avx512_int64_mul_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t avx512_fp32_rsqrt_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t avx512_fp32_add_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t avx512_fp32_fma_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t avx512_fp64_add_test(uint64_t iterations, void *data) SMALLKITTEN; +extern uint64_t avx512_fp64_fma_test(uint64_t iterations, void *data) SMALLKITTEN; + #ifndef _MSC_VER #include void __cpuidex(int *data, int function, int subfunction) { @@ -83,6 +96,7 @@ void RunTests() { float sseFp64Adds = measureFunction(iterations, sse_fp64_add_test, fp64TestArr); float sseFp64Muls = measureFunction(iterations, sse_fp64_mul_test, fp64TestArr); float sseFp64Muladds = measureFunction(iterations, sse_fp64_muladd_test, fp64TestArr); + float sseFp32Rsqrts = measureFunction(iterations, sse_fp32_rsqrt_test, fpTestArr); float avx2Int32Adds, avx2Int32Muls, avx2Int64Adds, avx2Int64Muls; if (avx2_supported) { @@ -93,6 +107,7 @@ void RunTests() { } float avxFp32Adds, avxFp32Muls, avxFp32Muladds, avxFp64Adds, avxFp64Muls, avxFp64Muladds; + float avxFp32Rsqrts; if (avx_supported) { avxFp32Adds = measureFunction(iterations, 
avx_fp32_add_test, fpTestArr); @@ -101,6 +116,7 @@ void RunTests() { avxFp64Adds = measureFunction(iterations, avx_fp64_add_test, fp64TestArr); avxFp64Muls = measureFunction(iterations, avx_fp64_mul_test, fp64TestArr); avxFp64Muladds = measureFunction(iterations, avx_fp64_muladd_test, fp64TestArr); + avxFp32Rsqrts = measureFunction(iterations, avx_fp32_rsqrt_test, fpTestArr); } float fmaFp32, fmaFp64; @@ -109,41 +125,66 @@ void RunTests() { fmaFp64 = measureFunction(iterations, fp64_fma_test, fpTestArr); } + float avx512Fp32Rsqrts, avx512Fp32Adds, avx512Fp32Fmas, avx512Fp64Adds, avx512Fp64Fmas; + float avx512Int32Adds, avx512Int32Muls, avx512Int64Adds, avx512Int64Muls; + if (avx512_supported) { + avx512Fp32Rsqrts = measureFunction(iterations, avx512_fp32_rsqrt_test, fpTestArr); + avx512Fp32Adds = measureFunction(iterations, avx512_fp32_add_test, fpTestArr); + avx512Fp32Fmas = measureFunction(iterations, avx512_fp32_fma_test, fpTestArr); + avx512Fp64Adds = measureFunction(iterations, avx512_fp64_add_test, fp64TestArr); + avx512Fp64Fmas = measureFunction(iterations, avx512_fp64_fma_test, fp64TestArr); + avx512Int32Adds = measureFunction(iterations, avx512_int32_add_test, intTestArr); + avx512Int32Muls = measureFunction(iterations, avx512_int32_mul_test, intTestArr); + avx512Int64Adds = measureFunction(iterations, avx512_int64_add_test, int64TestArr); + avx512Int64Muls = measureFunction(iterations, avx512_int64_mul_test, int64TestArr); + } + printf("-----GOPS/s-----\n"); // INT32 printf("\n-----INT32-----\n"); printf("SSE INT32 Adds: %f\n", sseInt32Adds); if (avx2_supported) printf("AVX2 INT32 Adds: %f\n", avx2Int32Adds); + if (avx512_supported) printf("AVX512 INT32 Adds: %f\n", avx512Int32Adds); printf("SSE INT32 Multiplies: %f\n", sseInt32Muls); if (avx2_supported) printf("AVX2 INT32 Multiplies: %f\n", avx2Int32Muls); + if (avx512_supported) printf("AVX512 INT32 Multiplies: %f\n", avx512Int32Muls); // FP32 printf("\n-----FP32-----\n"); printf("SSE FP32 Adds: %f\n", 
sseFp32Adds); if (avx_supported) printf("AVX FP32 Adds: %f\n", avxFp32Adds); + if (avx512_supported) printf("AVX512 FP32 Adds: %f\n", avx512Fp32Adds); printf("SSE FP32 Multiplies: %f\n", sseFp32Muls); if (avx_supported) printf("AVX FP32 Multiplies: %f\n", avxFp32Muls); printf("SSE FP32 Multiply+Adds: %f\n", sseFp32MulAdds); if (avx_supported) printf("AVX FP32 Multiply+Adds: %f (%f GFLOPS)\n", avxFp32Muladds, 2 * avxFp32Muladds); if (fma_supported) printf("FP32 FMAs: %f (%f GFLOPS)\n", fmaFp32, 2 * fmaFp32); + if (avx512_supported) printf("AVX512 FP32 FMAs: %f (%f GFLOPS)\n", avx512Fp32Fmas, avx512Fp32Fmas * 2); + printf("SSE FP32 Inverse Square Roots: %f\n", sseFp32Rsqrts); + if (avx_supported) printf("AVX FP32 Inverse Square Roots: %f\n", avxFp32Rsqrts); + if (avx512_supported) printf("AVX512 FP32 Inverse Square Roots: %f\n", avx512Fp32Rsqrts); // INT64 printf("\n-----INT64-----\n"); printf("SSE INT64 Adds: %f\n", sseInt64Adds); if (avx2_supported) printf("AVX2 INT64 Adds: %f\n", avx2Int64Adds); + if (avx512_supported) printf("AVX512 INT64 Adds: %f\n", avx512Int64Adds); printf("SSE INT64 Multiplies: %f\n", sseInt64Muls); if (avx2_supported) printf("AVX2 INT64 Multiplies: %f\n", avx2Int64Muls); + if (avx512_supported) printf("AVX512 INT64 Multiplies: %f\n", avx512Int64Muls); // FP64 printf("\n-----FP64-----\n"); printf("SSE FP64 Adds: %f\n", sseFp64Adds); if (avx_supported) printf("AVX FP64 Adds: %f\n", avxFp64Adds); + if (avx512_supported) printf("AVX512 FP64 Adds: %f\n", avx512Fp64Adds); printf("SSE FP64 Multiplies: %f\n", sseFp64Muls); if (avx_supported) printf("AVX FP64 Multiplies: %f\n", avxFp64Muls); printf("SSE FP64 Multiply+Adds: %f (%f GFLOPS)\n", sseFp64Muladds, 2 * sseFp64Muladds); if (avx_supported) printf("AVX FP64 Multiply+Adds: %f (%f GFLOPS)\n", avxFp64Muladds, 2 * avxFp64Muladds); if (fma_supported) printf("AVX FP64 FMAs: %f (%f GFLOPS)\n", fmaFp64, 2 * fmaFp64); + if (avx512_supported) printf("AVX512 FP64 FMAs: %f (%f GFLOPS)\n", avx512Fp64Fmas, 
avx512Fp64Fmas * 2); free(intTestArr); free(fpTestArr);