diff --git a/mt_instructionrate/InstructionRateFunctions.asm b/mt_instructionrate/InstructionRateFunctions.asm
index 75a4cf0..c27198b 100644
--- a/mt_instructionrate/InstructionRateFunctions.asm
+++ b/mt_instructionrate/InstructionRateFunctions.asm
@@ -4,8 +4,27 @@ bits 64
global sse_int32_add_test
global sse_int32_mul_test
+global sse_int64_add_test
+global sse_int64_mul_test
global avx2_int32_add_test
global avx2_int32_mul_test
+global avx2_int64_add_test
+global avx2_int64_mul_test
+global sse_fp32_add_test
+global sse_fp32_mul_test
+global sse_fp32_muladd_test
+global avx_fp32_add_test
+global avx_fp32_mul_test
+global avx_fp32_muladd_test
+global fp32_fma_test
+global fp64_fma_test
+
+global sse_fp64_add_test
+global sse_fp64_mul_test
+global sse_fp64_muladd_test
+global avx_fp64_add_test
+global avx_fp64_mul_test
+global avx_fp64_muladd_test
sse_int32_add_test:
movups xmm0, [rdx]
@@ -26,6 +45,24 @@ sse_int32_add_test_loop:
jg sse_int32_add_test_loop
ret
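+; Each of the tests below follows the same pattern: load six registers with
+; input data, then repeatedly apply the operation under test to each one.
+; The six chains are independent, so the loop measures throughput rather
+; than latency. rcx holds the remaining operation count; each iteration
+; here retires six paddq instructions of two lanes each, hence sub rcx, 12.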
+sse_int64_add_test:
+ movups xmm0, [rdx]
+ movups xmm1, [rdx + 16]
+ movups xmm2, [rdx + 32]
+ movups xmm3, [rdx + 48]
+ movups xmm4, [rdx + 64]
+ movups xmm5, [rdx + 80]
+sse_int64_add_test_loop:
+ paddq xmm0, xmm0
+ paddq xmm1, xmm1
+ paddq xmm2, xmm2
+ paddq xmm3, xmm3
+ paddq xmm4, xmm4
+ paddq xmm5, xmm5
+ sub rcx, 12
+ jg sse_int64_add_test_loop
+ ret
+
sse_int32_mul_test:
movups xmm0, [rdx]
movups xmm1, [rdx + 16]
@@ -41,10 +78,27 @@ sse_int32_mul_test_loop:
pmulld xmm4, xmm4
pmulld xmm5, xmm5
sub rcx, 24
- cmp rcx, 0
jg sse_int32_mul_test_loop
ret
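+; SSE has no packed 64-bit multiply, so pmuludq (unsigned 32x32 -> 64-bit
+; multiply of the low dword in each qword lane) stands in for one here.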
+sse_int64_mul_test:
+ movups xmm0, [rdx]
+ movups xmm1, [rdx + 16]
+ movups xmm2, [rdx + 32]
+ movups xmm3, [rdx + 48]
+ movups xmm4, [rdx + 64]
+ movups xmm5, [rdx + 80]
+sse_int64_mul_test_loop:
+ pmuludq xmm0, xmm0
+ pmuludq xmm1, xmm1
+ pmuludq xmm2, xmm2
+ pmuludq xmm3, xmm3
+ pmuludq xmm4, xmm4
+ pmuludq xmm5, xmm5
+ sub rcx, 12
+ jg sse_int64_mul_test_loop
+ ret
+
avx2_int32_add_test:
vmovups ymm0, [rdx]
vmovups ymm1, [rdx + 32]
@@ -79,6 +133,321 @@ avx2_int32_mul_test_loop:
vpmulld ymm4, ymm4, ymm4
vpmulld ymm5, ymm5, ymm5
sub rcx, 48
- cmp rcx, 0
jg avx2_int32_mul_test_loop
+ ret
+
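+; 64-bit adds with AVX2: four qword lanes per ymm x six registers = 24 ops.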
+avx2_int64_add_test:
+ vmovups ymm0, [rdx]
+ vmovups ymm1, [rdx + 32]
+ vmovups ymm2, [rdx + 64]
+ vmovups ymm3, [rdx + 96]
+ vmovups ymm4, [rdx + 128]
+ vmovups ymm5, [rdx + 160]
+avx2_int64_add_test_loop:
+ vpaddq ymm0, ymm0, ymm0
+ vpaddq ymm1, ymm1, ymm1
+ vpaddq ymm2, ymm2, ymm2
+ vpaddq ymm3, ymm3, ymm3
+ vpaddq ymm4, ymm4, ymm4
+ vpaddq ymm5, ymm5, ymm5
+ sub rcx, 24
+ jg avx2_int64_add_test_loop
+ ret
+
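+; A full 64-bit multiply (vpmullq) requires AVX-512DQ, so vpmuldq
+; (signed 32x32 -> 64-bit) serves as the AVX2 stand-in.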
+avx2_int64_mul_test:
+ vmovups ymm0, [rdx]
+ vmovups ymm1, [rdx + 32]
+ vmovups ymm2, [rdx + 64]
+ vmovups ymm3, [rdx + 96]
+ vmovups ymm4, [rdx + 128]
+ vmovups ymm5, [rdx + 160]
+avx2_int64_mul_test_loop:
+ vpmuldq ymm0, ymm0, ymm0
+ vpmuldq ymm1, ymm1, ymm1
+ vpmuldq ymm2, ymm2, ymm2
+ vpmuldq ymm3, ymm3, ymm3
+ vpmuldq ymm4, ymm4, ymm4
+ vpmuldq ymm5, ymm5, ymm5
+ sub rcx, 24
+ jg avx2_int64_mul_test_loop
+ ret
+
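+; FP tests mirror the integer ones: six independent chains, with rcx
+; decremented by (lanes per register) x 6 each iteration.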
+sse_fp32_add_test:
+ movups xmm0, [rdx]
+ movups xmm1, [rdx + 16]
+ movups xmm2, [rdx + 32]
+ movups xmm3, [rdx + 48]
+ movups xmm4, [rdx + 64]
+ movups xmm5, [rdx + 80]
+sse_fp32_add_test_loop:
+ addps xmm0, xmm0
+ addps xmm1, xmm1
+ addps xmm2, xmm2
+ addps xmm3, xmm3
+ addps xmm4, xmm4
+ addps xmm5, xmm5
+ sub rcx, 24
+ jg sse_fp32_add_test_loop
+ ret
+
+sse_fp64_add_test:
+ movups xmm0, [rdx]
+ movups xmm1, [rdx + 16]
+ movups xmm2, [rdx + 32]
+ movups xmm3, [rdx + 48]
+ movups xmm4, [rdx + 64]
+ movups xmm5, [rdx + 80]
+sse_fp64_add_test_loop:
+ addpd xmm0, xmm0
+ addpd xmm1, xmm1
+ addpd xmm2, xmm2
+ addpd xmm3, xmm3
+ addpd xmm4, xmm4
+ addpd xmm5, xmm5
+ sub rcx, 12
+ jg sse_fp64_add_test_loop
+ ret
+
+sse_fp32_mul_test:
+ movups xmm0, [rdx]
+ movups xmm1, [rdx + 16]
+ movups xmm2, [rdx + 32]
+ movups xmm3, [rdx + 48]
+ movups xmm4, [rdx + 64]
+ movups xmm5, [rdx + 80]
+sse_fp32_mul_test_loop:
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ sub rcx, 24
+ jg sse_fp32_mul_test_loop
+ ret
+
+sse_fp64_mul_test:
+ movups xmm0, [rdx]
+ movups xmm1, [rdx + 16]
+ movups xmm2, [rdx + 32]
+ movups xmm3, [rdx + 48]
+ movups xmm4, [rdx + 64]
+ movups xmm5, [rdx + 80]
+sse_fp64_mul_test_loop:
+ mulpd xmm0, xmm0
+ mulpd xmm1, xmm1
+ mulpd xmm2, xmm2
+ mulpd xmm3, xmm3
+ mulpd xmm4, xmm4
+ mulpd xmm5, xmm5
+ sub rcx, 12
+ jg sse_fp64_mul_test_loop
+ ret
+
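+; The muladd tests issue a dependent multiply/add pair per register. rcx
+; counts pairs, so each counted op is two FLOPs.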
+sse_fp32_muladd_test:
+ movups xmm0, [rdx]
+ movups xmm1, [rdx + 16]
+ movups xmm2, [rdx + 32]
+ movups xmm3, [rdx + 48]
+ movups xmm4, [rdx + 64]
+ movups xmm5, [rdx + 80]
+sse_fp32_muladd_test_loop:
+ mulps xmm0, xmm0
+ addps xmm0, xmm0
+ mulps xmm1, xmm1
+ addps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm2, xmm2
+ mulps xmm3, xmm3
+ addps xmm3, xmm3
+ mulps xmm4, xmm4
+ addps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm5, xmm5
+ sub rcx, 24
+ jg sse_fp32_muladd_test_loop
+ ret
+
+sse_fp64_muladd_test:
+ movups xmm0, [rdx]
+ movups xmm1, [rdx + 16]
+ movups xmm2, [rdx + 32]
+ movups xmm3, [rdx + 48]
+ movups xmm4, [rdx + 64]
+ movups xmm5, [rdx + 80]
+sse_fp64_muladd_test_loop:
+ mulpd xmm0, xmm0
+ addpd xmm0, xmm0
+ mulpd xmm1, xmm1
+ addpd xmm1, xmm1
+ mulpd xmm2, xmm2
+ addpd xmm2, xmm2
+ mulpd xmm3, xmm3
+ addpd xmm3, xmm3
+ mulpd xmm4, xmm4
+ addpd xmm4, xmm4
+ mulpd xmm5, xmm5
+ addpd xmm5, xmm5
+ sub rcx, 12
+ jg sse_fp64_muladd_test_loop
+ ret
+
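+; AVX versions: eight FP32 or four FP64 lanes per ymm register.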
+avx_fp32_add_test:
+ vmovups ymm0, [rdx]
+ vmovups ymm1, [rdx + 32]
+ vmovups ymm2, [rdx + 64]
+ vmovups ymm3, [rdx + 96]
+ vmovups ymm4, [rdx + 128]
+ vmovups ymm5, [rdx + 160]
+avx_fp32_add_test_loop:
+ vaddps ymm0, ymm0, ymm0
+ vaddps ymm1, ymm1, ymm1
+ vaddps ymm2, ymm2, ymm2
+ vaddps ymm3, ymm3, ymm3
+ vaddps ymm4, ymm4, ymm4
+ vaddps ymm5, ymm5, ymm5
+ sub rcx, 48
+ jg avx_fp32_add_test_loop
+ ret
+
+avx_fp64_add_test:
+ vmovups ymm0, [rdx]
+ vmovups ymm1, [rdx + 32]
+ vmovups ymm2, [rdx + 64]
+ vmovups ymm3, [rdx + 96]
+ vmovups ymm4, [rdx + 128]
+ vmovups ymm5, [rdx + 160]
+avx_fp64_add_test_loop:
+ vaddpd ymm0, ymm0, ymm0
+ vaddpd ymm1, ymm1, ymm1
+ vaddpd ymm2, ymm2, ymm2
+ vaddpd ymm3, ymm3, ymm3
+ vaddpd ymm4, ymm4, ymm4
+ vaddpd ymm5, ymm5, ymm5
+ sub rcx, 24
+ jg avx_fp64_add_test_loop
+ ret
+
+avx_fp32_mul_test:
+ vmovups ymm0, [rdx]
+ vmovups ymm1, [rdx + 32]
+ vmovups ymm2, [rdx + 64]
+ vmovups ymm3, [rdx + 96]
+ vmovups ymm4, [rdx + 128]
+ vmovups ymm5, [rdx + 160]
+avx_fp32_mul_test_loop:
+ vmulps ymm0, ymm0, ymm0
+ vmulps ymm1, ymm1, ymm1
+ vmulps ymm2, ymm2, ymm2
+ vmulps ymm3, ymm3, ymm3
+ vmulps ymm4, ymm4, ymm4
+ vmulps ymm5, ymm5, ymm5
+ sub rcx, 48
+ jg avx_fp32_mul_test_loop
+ ret
+
+avx_fp64_mul_test:
+ vmovups ymm0, [rdx]
+ vmovups ymm1, [rdx + 32]
+ vmovups ymm2, [rdx + 64]
+ vmovups ymm3, [rdx + 96]
+ vmovups ymm4, [rdx + 128]
+ vmovups ymm5, [rdx + 160]
+avx_fp64_mul_test_loop:
+ vmulpd ymm0, ymm0, ymm0
+ vmulpd ymm1, ymm1, ymm1
+ vmulpd ymm2, ymm2, ymm2
+ vmulpd ymm3, ymm3, ymm3
+ vmulpd ymm4, ymm4, ymm4
+ vmulpd ymm5, ymm5, ymm5
+ sub rcx, 24
+ jg avx_fp64_mul_test_loop
+ ret
+
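+; As with SSE, the AVX muladd loops count multiply/add pairs, not
+; individual instructions.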
+avx_fp32_muladd_test:
+ vmovups ymm0, [rdx]
+ vmovups ymm1, [rdx + 32]
+ vmovups ymm2, [rdx + 64]
+ vmovups ymm3, [rdx + 96]
+ vmovups ymm4, [rdx + 128]
+ vmovups ymm5, [rdx + 160]
+avx_fp32_muladd_test_loop:
+ vmulps ymm0, ymm0, ymm0
+ vaddps ymm0, ymm0, ymm0
+ vmulps ymm1, ymm1, ymm1
+ vaddps ymm1, ymm1, ymm1
+ vmulps ymm2, ymm2, ymm2
+ vaddps ymm2, ymm2, ymm2
+ vmulps ymm3, ymm3, ymm3
+ vaddps ymm3, ymm3, ymm3
+ vmulps ymm4, ymm4, ymm4
+ vaddps ymm4, ymm4, ymm4
+ vmulps ymm5, ymm5, ymm5
+ vaddps ymm5, ymm5, ymm5
+ sub rcx, 48
+ jg avx_fp32_muladd_test_loop
+ ret
+
+avx_fp64_muladd_test:
+ vmovups ymm0, [rdx]
+ vmovups ymm1, [rdx + 32]
+ vmovups ymm2, [rdx + 64]
+ vmovups ymm3, [rdx + 96]
+ vmovups ymm4, [rdx + 128]
+ vmovups ymm5, [rdx + 160]
+avx_fp64_muladd_test_loop:
+ vmulpd ymm0, ymm0, ymm0
+ vaddpd ymm0, ymm0, ymm0
+ vmulpd ymm1, ymm1, ymm1
+ vaddpd ymm1, ymm1, ymm1
+ vmulpd ymm2, ymm2, ymm2
+ vaddpd ymm2, ymm2, ymm2
+ vmulpd ymm3, ymm3, ymm3
+ vaddpd ymm3, ymm3, ymm3
+ vmulpd ymm4, ymm4, ymm4
+ vaddpd ymm4, ymm4, ymm4
+ vmulpd ymm5, ymm5, ymm5
+ vaddpd ymm5, ymm5, ymm5
+ sub rcx, 24
+ jg avx_fp64_muladd_test_loop
+ ret
+
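+; FMA tests: vzeroall clears stale register state, then each chain runs
+; a = a * b + a via vfmadd132. Each FMA counts as one op in rcx but does
+; two FLOPs, which is why the C side also prints a doubled GFLOPS figure.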
+fp32_fma_test:
+ vzeroall
+ vmovups ymm0, [rdx]
+ vmovups ymm1, ymm0
+ vmovups ymm2, ymm0
+ vmovups ymm3, ymm0
+ vmovups ymm4, ymm0
+ vmovups ymm5, ymm0
+ vmovups ymm6, ymm0
+fp32_fma_test_loop:
+ vfmadd132ps ymm0, ymm0, ymm6
+ vfmadd132ps ymm1, ymm1, ymm6
+ vfmadd132ps ymm2, ymm2, ymm6
+ vfmadd132ps ymm3, ymm3, ymm6
+ vfmadd132ps ymm4, ymm4, ymm6
+ vfmadd132ps ymm5, ymm5, ymm6
+ sub rcx, 48
+ jg fp32_fma_test_loop
+ ret
+
+fp64_fma_test:
+ vzeroall
+ vmovups ymm0, [rdx]
+ vmovups ymm1, ymm0
+ vmovups ymm2, ymm0
+ vmovups ymm3, ymm0
+ vmovups ymm4, ymm0
+ vmovups ymm5, ymm0
+ vmovups ymm6, ymm0
+fp64_fma_test_loop:
+ vfmadd132pd ymm0, ymm0, ymm6
+ vfmadd132pd ymm1, ymm1, ymm6
+ vfmadd132pd ymm2, ymm2, ymm6
+ vfmadd132pd ymm3, ymm3, ymm6
+ vfmadd132pd ymm4, ymm4, ymm6
+ vfmadd132pd ymm5, ymm5, ymm6
+ sub rcx, 24
+ jg fp64_fma_test_loop
ret
\ No newline at end of file
diff --git a/mt_instructionrate/Project1.vcxproj b/mt_instructionrate/Project1.vcxproj
index 9a1cc83..df271e3 100644
--- a/mt_instructionrate/Project1.vcxproj
+++ b/mt_instructionrate/Project1.vcxproj
@@ -141,6 +141,10 @@
       <Command>nasm -f win64 InstructionRateFunctions.asm</Command>
       <Message>Building asm functions</Message>
       <Outputs>InstructionRateFunctions.obj</Outputs>
+      <ExcludedFromBuild>false</ExcludedFromBuild>
+      <Command>nasm -f win64 InstructionRateFunctions.asm</Command>
+      <Outputs>InstructionRateFunctions.obj</Outputs>
+      <Message>Building asm functions</Message>
diff --git a/mt_instructionrate/x86_mt_instructionrate.c b/mt_instructionrate/x86_mt_instructionrate.c
index 816621c..ec94540 100644
--- a/mt_instructionrate/x86_mt_instructionrate.c
+++ b/mt_instructionrate/x86_mt_instructionrate.c
@@ -1,7 +1,25 @@
extern uint64_t sse_int32_add_test(uint64_t iterations, void *data) SMALLKITTEN;
extern uint64_t sse_int32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_int64_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_int64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
extern uint64_t avx2_int32_add_test(uint64_t iterations, void* data) SMALLKITTEN;
extern uint64_t avx2_int32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx2_int64_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx2_int64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp32_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp32_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp32_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp32_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp64_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t sse_fp64_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp64_add_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t avx_fp64_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t fp32_fma_test(uint64_t iterations, void* data) SMALLKITTEN;
+extern uint64_t fp64_fma_test(uint64_t iterations, void* data) SMALLKITTEN;
#ifndef _MSC_VER
 #include <cpuid.h>
@@ -18,14 +36,18 @@ void __cpuidex(int *data, int function, int subfunction) {
void RunTests() {
int cpuid_data[4];
- int avx_supported = 0, avx2_supported = 0, avx512_supported = 0;
+ int avx_supported = 0, avx2_supported = 0, avx512_supported = 0, fma_supported = 0;
uint64_t iterations = 5500000000;
int testDataLength = 512;
uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);
- uint32_t *fpTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);
+ uint64_t* int64TestArr = (uint64_t*)malloc(sizeof(uint64_t) * testDataLength);
+ float *fpTestArr = (float *)malloc(sizeof(float) * testDataLength);
+ double* fp64TestArr = (double*)malloc(sizeof(double) * testDataLength);
for (int i = 0; i < testDataLength; i++) {
intTestArr[i] = i;
- fpTestArr[i] = i * 1.2f;
+ int64TestArr[i] = i * 2;
+ fpTestArr[i] = 1.0f + 0.02f * i;
+ fp64TestArr[i] = 2.0 + 0.01 * i;
}
__cpuidex(cpuid_data, 1, 0);
@@ -39,6 +61,11 @@ void RunTests() {
avx2_supported = 1;
}
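+ // CPUID leaf 1: ECX bit 12 indicates FMA3 support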
+ if (cpuid_data[2] & (1UL << 12)) {
+ fprintf(stderr, "FMA supported\n");
+ fma_supported = 1;
+ }
+
__cpuidex(cpuid_data, 7, 0);
if (cpuid_data[1] & (1UL << 16)) {
fprintf(stderr, "AVX512 supported\n");
@@ -47,20 +74,77 @@ void RunTests() {
fprintf(stderr, "Measuring INT32 adds with SSE\n");
float sseInt32Adds = measureFunction(iterations, sse_int32_add_test, intTestArr);
-
- float avx2Int32Adds;
- if (avx2_supported) avx2Int32Adds = measureFunction(iterations, avx2_int32_add_test, intTestArr);
float sseInt32Muls = measureFunction(iterations, sse_int32_mul_test, intTestArr);
+ float sseInt64Adds = measureFunction(iterations, sse_int64_add_test, int64TestArr);
+ float sseInt64Muls = measureFunction(iterations, sse_int64_mul_test, int64TestArr);
+ float sseFp32Adds = measureFunction(iterations, sse_fp32_add_test, fpTestArr);
+ float sseFp32Muls = measureFunction(iterations, sse_fp32_mul_test, fpTestArr);
+ float sseFp32Muladds = measureFunction(iterations, sse_fp32_muladd_test, fpTestArr);
+ float sseFp64Adds = measureFunction(iterations, sse_fp64_add_test, fp64TestArr);
+ float sseFp64Muls = measureFunction(iterations, sse_fp64_mul_test, fp64TestArr);
+ float sseFp64Muladds = measureFunction(iterations, sse_fp64_muladd_test, fp64TestArr);
+
+ float avx2Int32Adds, avx2Int32Muls, avx2Int64Adds, avx2Int64Muls;
+ if (avx2_supported) {
+ avx2Int32Adds = measureFunction(iterations, avx2_int32_add_test, intTestArr);
+ avx2Int32Muls = measureFunction(iterations, avx2_int32_mul_test, intTestArr);
+ avx2Int64Adds = measureFunction(iterations, avx2_int64_add_test, int64TestArr);
+ avx2Int64Muls = measureFunction(iterations, avx2_int64_mul_test, int64TestArr);
+ }
+
+ float avxFp32Adds, avxFp32Muls, avxFp32Muladds, avxFp64Adds, avxFp64Muls, avxFp64Muladds;
+ if (avx_supported) {
+ avxFp32Adds = measureFunction(iterations, avx_fp32_add_test, fpTestArr);
+ avxFp32Muls = measureFunction(iterations, avx_fp32_mul_test, fpTestArr);
+ avxFp32Muladds = measureFunction(iterations, avx_fp32_muladd_test, fpTestArr);
+ avxFp64Adds = measureFunction(iterations, avx_fp64_add_test, fp64TestArr);
+ avxFp64Muls = measureFunction(iterations, avx_fp64_mul_test, fp64TestArr);
+ avxFp64Muladds = measureFunction(iterations, avx_fp64_muladd_test, fp64TestArr);
+ }
- float avx2Int32Muls;
- if (avx2_supported) avx2Int32Muls = measureFunction(iterations, avx2_int32_mul_test, intTestArr);
+ float fmaFp32, fmaFp64;
+ if (fma_supported) {
+ fmaFp32 = measureFunction(iterations, fp32_fma_test, fpTestArr);
+ fmaFp64 = measureFunction(iterations, fp64_fma_test, fp64TestArr);
+ }
printf("-----GOPS/s-----\n");
+
+ // INT32
+ printf("\n-----INT32-----\n");
printf("SSE INT32 Adds: %f\n", sseInt32Adds);
if (avx2_supported) printf("AVX2 INT32 Adds: %f\n", avx2Int32Adds);
printf("SSE INT32 Multiplies: %f\n", sseInt32Muls);
if (avx2_supported) printf("AVX2 INT32 Multiplies: %f\n", avx2Int32Muls);
+
+ // FP32
+ printf("\n-----FP32-----\n");
+ printf("SSE FP32 Adds: %f\n", sseFp32Adds);
+ if (avx_supported) printf("AVX FP32 Adds: %f\n", avxFp32Adds);
+ printf("SSE FP32 Multiplies: %f\n", sseFp32Muls);
+ if (avx_supported) printf("AVX FP32 Multiplies: %f\n", avxFp32Muls);
+ printf("SSE FP32 Multiply+Adds: %f\n", sseFp32MulAdds);
+ if (avx_supported) printf("AVX FP32 Multiply+Adds: %f (%f GFLOPS)\n", avxFp32Muladds, 2 * avxFp32Muladds);
+ if (fma_supported) printf("FP32 FMAs: %f (%f GFLOPS)\n", fmaFp32, 2 * fmaFp32);
+ // INT64
+ printf("\n-----INT64-----\n");
+ printf("SSE INT64 Adds: %f\n", sseInt64Adds);
+ if (avx2_supported) printf("AVX2 INT64 Adds: %f\n", avx2Int64Adds);
+ printf("SSE INT64 Multiplies: %f\n", sseInt64Muls);
+ if (avx2_supported) printf("AVX2 INT64 Multiplies: %f\n", avx2Int64Muls);
+
+ // FP64
+ printf("\n-----FP64-----\n");
+ printf("SSE FP64 Adds: %f\n", sseFp64Adds);
+ if (avx_supported) printf("AVX FP64 Adds: %f\n", avxFp64Adds);
+ printf("SSE FP64 Multiplies: %f\n", sseFp64Muls);
+ if (avx_supported) printf("AVX FP64 Multiplies: %f\n", avxFp64Muls);
+ printf("SSE FP64 Multiply+Adds: %f (%f GFLOPS)\n", sseFp64Muladds, 2 * sseFp64Muladds);
+ if (avx_supported) printf("AVX FP64 Multiply+Adds: %f (%f GFLOPS)\n", avxFp64Muladds, 2 * avxFp64Muladds);
+ if (fma_supported) printf("AVX FP64 FMAs: %f (%f GFLOPS)\n", fmaFp64, 2 * fmaFp64);
+
 free(intTestArr);
+ free(int64TestArr);
 free(fpTestArr);
+ free(fp64TestArr);
return;