From 0674f0655fdcb50423edde9c9a35450e8502074f Mon Sep 17 00:00:00 2001
From: clamchowder
Date: Fri, 19 Jan 2024 11:15:40 -0800
Subject: [PATCH] save progress

---
 .../InstructionRateFunctions.asm            | 373 +++++++++++++++++-
 mt_instructionrate/Project1.vcxproj         |   4 +
 mt_instructionrate/x86_mt_instructionrate.c | 100 ++++-
 3 files changed, 467 insertions(+), 10 deletions(-)

diff --git a/mt_instructionrate/InstructionRateFunctions.asm b/mt_instructionrate/InstructionRateFunctions.asm
index 75a4cf0..c27198b 100644
--- a/mt_instructionrate/InstructionRateFunctions.asm
+++ b/mt_instructionrate/InstructionRateFunctions.asm
@@ -4,8 +4,27 @@ bits 64
 
 global sse_int32_add_test
 global sse_int32_mul_test
+global sse_int64_add_test
+global sse_int64_mul_test
 global avx2_int32_add_test
 global avx2_int32_mul_test
+global avx2_int64_add_test
+global avx2_int64_mul_test
+global sse_fp32_add_test
+global sse_fp32_mul_test
+global sse_fp32_muladd_test
+global avx_fp32_add_test
+global avx_fp32_mul_test
+global avx_fp32_muladd_test
+global fp32_fma_test
+global fp64_fma_test
+
+global sse_fp64_add_test
+global sse_fp64_mul_test
+global sse_fp64_muladd_test
+global avx_fp64_add_test
+global avx_fp64_mul_test
+global avx_fp64_muladd_test
 
 sse_int32_add_test:
     movups xmm0, [rdx]
@@ -26,6 +45,24 @@ sse_int32_add_test_loop:
     jg sse_int32_add_test_loop
     ret
 
+sse_int64_add_test:
+    movups xmm0, [rdx]
+    movups xmm1, [rdx + 16]
+    movups xmm2, [rdx + 32]
+    movups xmm3, [rdx + 48]
+    movups xmm4, [rdx + 64]
+    movups xmm5, [rdx + 80]
+sse_int64_add_test_loop:
+    paddq xmm0, xmm0
+    paddq xmm1, xmm1
+    paddq xmm2, xmm2
+    paddq xmm3, xmm3
+    paddq xmm4, xmm4
+    paddq xmm5, xmm5
+    sub rcx, 12
+    jg sse_int64_add_test_loop
+    ret
+
 sse_int32_mul_test:
     movups xmm0, [rdx]
     movups xmm1, [rdx + 16]
     movups xmm2, [rdx + 32]
     movups xmm3, [rdx + 48]
     movups xmm4, [rdx + 64]
     movups xmm5, [rdx + 72]
 sse_int32_mul_test_loop:
     pmulld xmm0, xmm0
     pmulld xmm1, xmm1
     pmulld xmm2, xmm2
     pmulld xmm3, xmm3
     pmulld xmm4, xmm4
     pmulld xmm5, xmm5
     sub rcx, 24
-    cmp rcx, 0
     jg sse_int32_mul_test_loop
     ret
 
+sse_int64_mul_test:
+    movups xmm0, [rdx]
+    movups xmm1, [rdx + 16]
+    movups xmm2, [rdx + 32]
+    movups xmm3, [rdx + 48]
+    movups xmm4, [rdx + 64]
+    movups xmm5, [rdx + 80]
+sse_int64_mul_test_loop:
+    pmuludq xmm0, xmm0
+    pmuludq xmm1, xmm1
+    pmuludq xmm2, xmm2
+    pmuludq xmm3, xmm3
+    pmuludq xmm4, xmm4
+    pmuludq xmm5, xmm5
+    sub rcx, 12
+    jg sse_int64_mul_test_loop
+    ret
+
 avx2_int32_add_test:
     vmovups ymm0, [rdx]
     vmovups ymm1, [rdx + 32]
@@ -79,6 +133,321 @@ avx2_int32_mul_test_loop:
     vpmulld ymm4, ymm4, ymm4
     vpmulld ymm5, ymm5, ymm5
     sub rcx, 48
-    cmp rcx, 0
     jg avx2_int32_mul_test_loop
+    ret
+
+avx2_int64_add_test:
+    vmovups ymm0, [rdx]
+    vmovups ymm1, [rdx + 32]
+    vmovups ymm2, [rdx + 64]
+    vmovups ymm3, [rdx + 96]
+    vmovups ymm4, [rdx + 128]
+    vmovups ymm5, [rdx + 160]
+avx2_int64_add_test_loop:
+    vpaddq ymm0, ymm0, ymm0
+    vpaddq ymm1, ymm1, ymm1
+    vpaddq ymm2, ymm2, ymm2
+    vpaddq ymm3, ymm3, ymm3
+    vpaddq ymm4, ymm4, ymm4
+    vpaddq ymm5, ymm5, ymm5
+    sub rcx, 24
+    jg avx2_int64_add_test_loop
+    ret
+
+avx2_int64_mul_test:
+    vmovups ymm0, [rdx]
+    vmovups ymm1, [rdx + 32]
+    vmovups ymm2, [rdx + 64]
+    vmovups ymm3, [rdx + 96]
+    vmovups ymm4, [rdx + 128]
+    vmovups ymm5, [rdx + 160]
+avx2_int64_mul_test_loop:
+    vpmuldq ymm0, ymm0, ymm0
+    vpmuldq ymm1, ymm1, ymm1
+    vpmuldq ymm2, ymm2, ymm2
+    vpmuldq ymm3, ymm3, ymm3
+    vpmuldq ymm4, ymm4, ymm4
+    vpmuldq ymm5, ymm5, ymm5
+    sub rcx, 24
+    jg avx2_int64_mul_test_loop
+    ret
+
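+; The FP tests below follow the integer tests' pattern: square six
+; independent registers per loop iteration so that throughput, not latency,
+; is measured. rcx counts operations: lanes per register times six, with a
+; multiply+add pair counted as one operation in the muladd tests.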
+sse_fp32_add_test:
+    movups xmm0, [rdx]
+    movups xmm1, [rdx + 16]
+    movups xmm2, [rdx + 32]
+    movups xmm3, [rdx + 48]
+    movups xmm4, [rdx + 64]
+    movups xmm5, [rdx + 80]
+sse_fp32_add_test_loop:
+    addps xmm0, xmm0
+    addps xmm1, xmm1
+    addps xmm2, xmm2
+    addps xmm3, xmm3
+    addps xmm4, xmm4
+    addps xmm5, xmm5
+    sub rcx, 24
+    jg sse_fp32_add_test_loop
+    ret
+
+sse_fp64_add_test:
+    movups xmm0, [rdx]
+    movups xmm1, [rdx + 16]
+    movups xmm2, [rdx + 32]
+    movups xmm3, [rdx + 48]
+    movups xmm4, [rdx + 64]
+    movups xmm5, [rdx + 80]
+sse_fp64_add_test_loop:
+    addpd xmm0, xmm0
+    addpd xmm1, xmm1
+    addpd xmm2, xmm2
+    addpd xmm3, xmm3
+    addpd xmm4, xmm4
+    addpd xmm5, xmm5
+    sub rcx, 12
+    jg sse_fp64_add_test_loop
+    ret
+
+sse_fp32_mul_test:
+    movups xmm0, [rdx]
+    movups xmm1, [rdx + 16]
+    movups xmm2, [rdx + 32]
+    movups xmm3, [rdx + 48]
+    movups xmm4, [rdx + 64]
+    movups xmm5, [rdx + 80]
+sse_fp32_mul_test_loop:
+    mulps xmm0, xmm0
+    mulps xmm1, xmm1
+    mulps xmm2, xmm2
+    mulps xmm3, xmm3
+    mulps xmm4, xmm4
+    mulps xmm5, xmm5
+    sub rcx, 24
+    jg sse_fp32_mul_test_loop
+    ret
+
+sse_fp64_mul_test:
+    movups xmm0, [rdx]
+    movups xmm1, [rdx + 16]
+    movups xmm2, [rdx + 32]
+    movups xmm3, [rdx + 48]
+    movups xmm4, [rdx + 64]
+    movups xmm5, [rdx + 80]
+sse_fp64_mul_test_loop:
+    mulpd xmm0, xmm0
+    mulpd xmm1, xmm1
+    mulpd xmm2, xmm2
+    mulpd xmm3, xmm3
+    mulpd xmm4, xmm4
+    mulpd xmm5, xmm5
+    sub rcx, 12
+    jg sse_fp64_mul_test_loop
+    ret
+
+sse_fp32_muladd_test:
+    movups xmm0, [rdx]
+    movups xmm1, [rdx + 16]
+    movups xmm2, [rdx + 32]
+    movups xmm3, [rdx + 48]
+    movups xmm4, [rdx + 64]
+    movups xmm5, [rdx + 80]
+sse_fp32_muladd_test_loop:
+    mulps xmm0, xmm0
+    addps xmm0, xmm0
+    mulps xmm1, xmm1
+    addps xmm1, xmm1
+    mulps xmm2, xmm2
+    addps xmm2, xmm2
+    mulps xmm3, xmm3
+    addps xmm3, xmm3
+    mulps xmm4, xmm4
+    addps xmm4, xmm4
+    mulps xmm5, xmm5
+    addps xmm5, xmm5
+    sub rcx, 24
+    jg sse_fp32_muladd_test_loop
+    ret
+
+sse_fp64_muladd_test:
+    movups xmm0, [rdx]
+    movups xmm1, [rdx + 16]
+    movups xmm2, [rdx + 32]
+    movups xmm3, [rdx + 48]
+    movups xmm4, [rdx + 64]
+    movups xmm5, [rdx + 80]
+sse_fp64_muladd_test_loop:
+    mulpd xmm0, xmm0
+    addpd xmm0, xmm0
+    mulpd xmm1, xmm1
+    addpd xmm1, xmm1
+    mulpd xmm2, xmm2
+    addpd xmm2, xmm2
+    mulpd xmm3, xmm3
+    addpd xmm3, xmm3
+    mulpd xmm4, xmm4
+    addpd xmm4, xmm4
+    mulpd xmm5, xmm5
+    addpd xmm5, xmm5
+    sub rcx, 12
+    jg sse_fp64_muladd_test_loop
+    ret
+
+avx_fp32_add_test:
+    vmovups ymm0, [rdx]
+    vmovups ymm1, [rdx + 32]
+    vmovups ymm2, [rdx + 64]
+    vmovups ymm3, [rdx + 96]
+    vmovups ymm4, [rdx + 128]
+    vmovups ymm5, [rdx + 160]
+avx_fp32_add_test_loop:
+    vaddps ymm0, ymm0, ymm0
+    vaddps ymm1, ymm1, ymm1
+    vaddps ymm2, ymm2, ymm2
+    vaddps ymm3, ymm3, ymm3
+    vaddps ymm4, ymm4, ymm4
+    vaddps ymm5, ymm5, ymm5
+    sub rcx, 48
+    jg avx_fp32_add_test_loop
+    ret
+
+avx_fp64_add_test:
+    vmovups ymm0, [rdx]
+    vmovups ymm1, [rdx + 32]
+    vmovups ymm2, [rdx + 64]
+    vmovups ymm3, [rdx + 96]
+    vmovups ymm4, [rdx + 128]
+    vmovups ymm5, [rdx + 160]
+avx_fp64_add_test_loop:
+    vaddpd ymm0, ymm0, ymm0
+    vaddpd ymm1, ymm1, ymm1
+    vaddpd ymm2, ymm2, ymm2
+    vaddpd ymm3, ymm3, ymm3
+    vaddpd ymm4, ymm4, ymm4
+    vaddpd ymm5, ymm5, ymm5
+    sub rcx, 24
+    jg avx_fp64_add_test_loop
+    ret
+
+avx_fp32_mul_test:
+    vmovups ymm0, [rdx]
+    vmovups ymm1, [rdx + 32]
+    vmovups ymm2, [rdx + 64]
+    vmovups ymm3, [rdx + 96]
+    vmovups ymm4, [rdx + 128]
+    vmovups ymm5, [rdx + 160]
+avx_fp32_mul_test_loop:
+    vmulps ymm0, ymm0, ymm0
+    vmulps ymm1, ymm1, ymm1
+    vmulps ymm2, ymm2, ymm2
+    vmulps ymm3, ymm3, ymm3
+    vmulps ymm4, ymm4, ymm4
+    vmulps ymm5, ymm5, ymm5
+    sub rcx, 48
+    jg avx_fp32_mul_test_loop
+    ret
+
+avx_fp64_mul_test:
+    vmovups ymm0, [rdx]
+    vmovups ymm1, [rdx + 32]
+    vmovups ymm2, [rdx + 64]
+    vmovups ymm3, [rdx + 96]
+    vmovups ymm4, [rdx + 128]
+    vmovups ymm5, [rdx + 160]
+avx_fp64_mul_test_loop:
+    vmulpd ymm0, ymm0, ymm0
+    vmulpd ymm1, ymm1, ymm1
+    vmulpd ymm2, ymm2, ymm2
+    vmulpd ymm3, ymm3, ymm3
+    vmulpd ymm4, ymm4, ymm4
+    vmulpd ymm5, ymm5, ymm5
+    sub rcx, 24
+    jg avx_fp64_mul_test_loop
+    ret
+
+avx_fp32_muladd_test:
+    vmovups ymm0, [rdx]
+    vmovups ymm1, [rdx + 32]
+    vmovups ymm2, [rdx + 64]
+    vmovups ymm3, [rdx + 96]
+    vmovups ymm4, [rdx + 128]
+    vmovups ymm5, [rdx + 160]
+avx_fp32_muladd_test_loop:
+    vmulps ymm0, ymm0, ymm0
+    vaddps ymm0, ymm0, ymm0
+    vmulps ymm1, ymm1, ymm1
+    vaddps ymm1, ymm1, ymm1
+    vmulps ymm2, ymm2, ymm2
+    vaddps ymm2, ymm2, ymm2
+    vmulps ymm3, ymm3, ymm3
+    vaddps ymm3, ymm3, ymm3
+    vmulps ymm4, ymm4, ymm4
+    vaddps ymm4, ymm4, ymm4
+    vmulps ymm5, ymm5, ymm5
+    vaddps ymm5, ymm5, ymm5
+    sub rcx, 48
+    jg avx_fp32_muladd_test_loop
+    ret
+
+avx_fp64_muladd_test:
+    vmovups ymm0, [rdx]
+    vmovups ymm1, [rdx + 32]
+    vmovups ymm2, [rdx + 64]
+    vmovups ymm3, [rdx + 96]
+    vmovups ymm4, [rdx + 128]
+    vmovups ymm5, [rdx + 160]
+avx_fp64_muladd_test_loop:
+    vmulpd ymm0, ymm0, ymm0
+    vaddpd ymm0, ymm0, ymm0
+    vmulpd ymm1, ymm1, ymm1
+    vaddpd ymm1, ymm1, ymm1
+    vmulpd ymm2, ymm2, ymm2
+    vaddpd ymm2, ymm2, ymm2
+    vmulpd ymm3, ymm3, ymm3
+    vaddpd ymm3, ymm3, ymm3
+    vmulpd ymm4, ymm4, ymm4
+    vaddpd ymm4, ymm4, ymm4
+    vmulpd ymm5, ymm5, ymm5
+    vaddpd ymm5, ymm5, ymm5
+    sub rcx, 24
+    jg avx_fp64_muladd_test_loop
+    ret
+
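+; FMA tests: vfmadd132 computes dst = dst*src3 + src2, so each instruction
+; is one fused multiply-add. rcx counts FMAs (8 FP32 or 4 FP64 lanes times
+; six registers); the harness reports twice this figure as GFLOPS.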
+fp32_fma_test:
+    vzeroall
+    vmovups ymm0, [rdx]
+    vmovaps ymm1, ymm0
+    vmovaps ymm2, ymm0
+    vmovaps ymm3, ymm0
+    vmovaps ymm4, ymm0
+    vmovaps ymm5, ymm0
+    vmovaps ymm6, ymm0
+fp32_fma_test_loop:
+    vfmadd132ps ymm0, ymm0, ymm6
+    vfmadd132ps ymm1, ymm1, ymm6
+    vfmadd132ps ymm2, ymm2, ymm6
+    vfmadd132ps ymm3, ymm3, ymm6
+    vfmadd132ps ymm4, ymm4, ymm6
+    vfmadd132ps ymm5, ymm5, ymm6
+    sub rcx, 48
+    jg fp32_fma_test_loop
+    ret
+
+fp64_fma_test:
+    vzeroall
+    vmovups ymm0, [rdx]
+    vmovaps ymm1, ymm0
+    vmovaps ymm2, ymm0
+    vmovaps ymm3, ymm0
+    vmovaps ymm4, ymm0
+    vmovaps ymm5, ymm0
+    vmovaps ymm6, ymm0
+fp64_fma_test_loop:
+    vfmadd132pd ymm0, ymm0, ymm6
+    vfmadd132pd ymm1, ymm1, ymm6
+    vfmadd132pd ymm2, ymm2, ymm6
+    vfmadd132pd ymm3, ymm3, ymm6
+    vfmadd132pd ymm4, ymm4, ymm6
+    vfmadd132pd ymm5, ymm5, ymm6
+    sub rcx, 24
+    jg fp64_fma_test_loop
     ret
\ No newline at end of file
diff --git a/mt_instructionrate/Project1.vcxproj b/mt_instructionrate/Project1.vcxproj
index 9a1cc83..df271e3 100644
--- a/mt_instructionrate/Project1.vcxproj
+++ b/mt_instructionrate/Project1.vcxproj
@@ -141,6 +141,10 @@
       nasm -f win64 InstructionRateFunctions.asm
       Building asm functions
       InstructionRateFunctions.obj
+      false
+      nasm -f win64 InstructionRateFunctions.asm
+      InstructionRateFunctions.obj
+      Building asm functions
diff --git a/mt_instructionrate/x86_mt_instructionrate.c b/mt_instructionrate/x86_mt_instructionrate.c
index 816621c..ec94540 100644
--- a/mt_instructionrate/x86_mt_instructionrate.c
+++ b/mt_instructionrate/x86_mt_instructionrate.c
@@ -1,7 +1,25 @@
 extern uint64_t sse_int32_add_test(uint64_t iterations, void *data) SMALLKITTEN;
 extern uint64_t sse_int32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
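+// Test kernels added by this patch; all are implemented in InstructionRateFunctions.asm.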
+extern uint64_t sse_int64_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_int64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
 extern uint64_t avx2_int32_add_test(uint64_t iterations, void* data) SMALLKITTEN;
 extern uint64_t avx2_int32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx2_int64_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx2_int64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp32_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp32_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp32_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp32_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp64_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp64_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp64_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp64_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t fp32_fma_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t fp64_fma_test(uint64_t iterations, void* data) SMALLKITTEN;
 
 #ifndef _MSC_VER
 #include <cpuid.h>
@@ -18,14 +36,18 @@ void __cpuidex(int *data, int function, int subfunction) {
 
 void RunTests() {
     int cpuid_data[4];
-    int avx_supported = 0, avx2_supported = 0, avx512_supported = 0;
+    int avx_supported = 0, avx2_supported = 0, avx512_supported = 0, fma_supported = 0;
     uint64_t iterations = 5500000000;
     int testDataLength = 512;
     uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);
-    uint32_t *fpTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);
+    uint64_t* int64TestArr = (uint64_t*)malloc(sizeof(uint64_t) * testDataLength);
+    float *fpTestArr = (float *)malloc(sizeof(float) * testDataLength);
+    double* fp64TestArr = (double*)malloc(sizeof(double) * testDataLength);
     for (int i = 0; i < testDataLength; i++) {
         intTestArr[i] = i;
-        fpTestArr[i] = i * 1.2f;
+        int64TestArr[i] = i * 2;
+        fpTestArr[i] = 1.0f + 0.02f * i;
+        fp64TestArr[i] = 2.0 + 0.01 * i;
     }
 
     __cpuidex(cpuid_data, 1, 0);
@@ -39,6 +61,11 @@
         avx2_supported = 1;
     }
 
+    if (cpuid_data[2] & (1UL << 12)) {
+        fprintf(stderr, "FMA supported\n");
+        fma_supported = 1;
+    }
+
     __cpuidex(cpuid_data, 7, 0);
     if (cpuid_data[1] & (1UL << 16)) {
         fprintf(stderr, "AVX512 supported\n");
@@ -47,20 +74,77 @@
 
     fprintf(stderr, "Measuring INT32 adds with SSE\n");
     float sseInt32Adds = measureFunction(iterations, sse_int32_add_test, intTestArr);
-
-    float avx2Int32Adds;
-    if (avx2_supported) avx2Int32Adds = measureFunction(iterations, avx2_int32_add_test, intTestArr);
     float sseInt32Muls = measureFunction(iterations, sse_int32_mul_test, intTestArr);
+    float sseInt64Adds = measureFunction(iterations, sse_int64_add_test, int64TestArr);
+    float sseInt64Muls = measureFunction(iterations, sse_int64_mul_test, int64TestArr);
+    float sseFp32Adds = measureFunction(iterations, sse_fp32_add_test, fpTestArr);
+    float sseFp32Muls = measureFunction(iterations, sse_fp32_mul_test, fpTestArr);
+    float sseFp32MulAdds = measureFunction(iterations, sse_fp32_muladd_test, fpTestArr);
+    float sseFp64Adds = measureFunction(iterations, sse_fp64_add_test, fp64TestArr);
+    float sseFp64Muls = measureFunction(iterations, sse_fp64_mul_test, fp64TestArr);
+    float sseFp64Muladds = measureFunction(iterations, sse_fp64_muladd_test, fp64TestArr);
+
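+    // AVX/AVX2/FMA tests only run when the CPUID checks above reported
+    // support; measureFunction results are in GOPS (billions of ops/s).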
+    float avx2Int32Adds, avx2Int32Muls, avx2Int64Adds, avx2Int64Muls;
+    if (avx2_supported) {
+        avx2Int32Adds = measureFunction(iterations, avx2_int32_add_test, intTestArr);
+        avx2Int32Muls = measureFunction(iterations, avx2_int32_mul_test, intTestArr);
+        avx2Int64Adds = measureFunction(iterations, avx2_int64_add_test, int64TestArr);
+        avx2Int64Muls = measureFunction(iterations, avx2_int64_mul_test, int64TestArr);
+    }
+
+    float avxFp32Adds, avxFp32Muls, avxFp32Muladds, avxFp64Adds, avxFp64Muls, avxFp64Muladds;
+    if (avx_supported)
+    {
+        avxFp32Adds = measureFunction(iterations, avx_fp32_add_test, fpTestArr);
+        avxFp32Muls = measureFunction(iterations, avx_fp32_mul_test, fpTestArr);
+        avxFp32Muladds = measureFunction(iterations, avx_fp32_muladd_test, fpTestArr);
+        avxFp64Adds = measureFunction(iterations, avx_fp64_add_test, fp64TestArr);
+        avxFp64Muls = measureFunction(iterations, avx_fp64_mul_test, fp64TestArr);
+        avxFp64Muladds = measureFunction(iterations, avx_fp64_muladd_test, fp64TestArr);
+    }
 
-    float avx2Int32Muls;
-    if (avx2_supported) avx2Int32Muls = measureFunction(iterations, avx2_int32_mul_test, intTestArr);
+    float fmaFp32, fmaFp64;
+    if (fma_supported) {
+        fmaFp32 = measureFunction(iterations, fp32_fma_test, fpTestArr);
+        fmaFp64 = measureFunction(iterations, fp64_fma_test, fp64TestArr);
+    }
 
     printf("-----GOPS/s-----\n");
+
+    // INT32
+    printf("\n-----INT32-----\n");
     printf("SSE INT32 Adds: %f\n", sseInt32Adds);
     if (avx2_supported) printf("AVX2 INT32 Adds: %f\n", avx2Int32Adds);
     printf("SSE INT32 Multiplies: %f\n", sseInt32Muls);
     if (avx2_supported) printf("AVX2 INT32 Multiplies: %f\n", avx2Int32Muls);
+
+    // FP32
+    printf("\n-----FP32-----\n");
+    printf("SSE FP32 Adds: %f\n", sseFp32Adds);
+    if (avx_supported) printf("AVX FP32 Adds: %f\n", avxFp32Adds);
+    printf("SSE FP32 Multiplies: %f\n", sseFp32Muls);
+    if (avx_supported) printf("AVX FP32 Multiplies: %f\n", avxFp32Muls);
+    printf("SSE FP32 Multiply+Adds: %f (%f GFLOPS)\n", sseFp32MulAdds, 2 * sseFp32MulAdds);
+    if (avx_supported) printf("AVX FP32 Multiply+Adds: %f (%f GFLOPS)\n", avxFp32Muladds, 2 * avxFp32Muladds);
+    if (fma_supported) printf("FP32 FMAs: %f (%f GFLOPS)\n", fmaFp32, 2 * fmaFp32);
+
+    // INT64
+    printf("\n-----INT64-----\n");
+    printf("SSE INT64 Adds: %f\n", sseInt64Adds);
+    if (avx2_supported) printf("AVX2 INT64 Adds: %f\n", avx2Int64Adds);
+    printf("SSE INT64 Multiplies: %f\n", sseInt64Muls);
+    if (avx2_supported) printf("AVX2 INT64 Multiplies: %f\n", avx2Int64Muls);
+
+    // FP64
+    printf("\n-----FP64-----\n");
+    printf("SSE FP64 Adds: %f\n", sseFp64Adds);
+    if (avx_supported) printf("AVX FP64 Adds: %f\n", avxFp64Adds);
+    printf("SSE FP64 Multiplies: %f\n", sseFp64Muls);
+    if (avx_supported) printf("AVX FP64 Multiplies: %f\n", avxFp64Muls);
+    printf("SSE FP64 Multiply+Adds: %f (%f GFLOPS)\n", sseFp64Muladds, 2 * sseFp64Muladds);
+    if (avx_supported) printf("AVX FP64 Multiply+Adds: %f (%f GFLOPS)\n", avxFp64Muladds, 2 * avxFp64Muladds);
+    if (fma_supported) printf("FP64 FMAs: %f (%f GFLOPS)\n", fmaFp64, 2 * fmaFp64);
 
     free(intTestArr);
+    free(int64TestArr);
     free(fpTestArr);
+    free(fp64TestArr);
     return;
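
For reference: measureFunction is not part of this diff. Below is a minimal,
single-threaded sketch of the assumed interface, inferred from the call sites
above. The real mt_instructionrate harness presumably spreads the iteration
count across threads; the timing shown here is illustrative only, not the
project's actual implementation.

    #include <stdint.h>
    #include <time.h>

    /* Assumed shape: run a kernel that counts down 'iterations' operations
       (the asm loops decrement the count passed in rcx), then convert the
       elapsed time to billions of operations per second (GOPS). */
    float measureFunction(uint64_t iterations,
                          uint64_t (*testFunc)(uint64_t, void *), void *data) {
        struct timespec start, end;
        timespec_get(&start, TIME_UTC);
        testFunc(iterations, data);
        timespec_get(&end, TIME_UTC);
        double seconds = (end.tv_sec - start.tv_sec)
                       + (end.tv_nsec - start.tv_nsec) * 1e-9;
        return (float)(iterations / seconds / 1e9);
    }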