diff --git a/mt_instructionrate/Makefile b/mt_instructionrate/Makefile index bea3a91..ace9860 100644 --- a/mt_instructionrate/Makefile +++ b/mt_instructionrate/Makefile @@ -2,3 +2,5 @@ x86: gcc -pthread -masm=intel x86_mt_instructionrate.s mt_instructionrate.c ../Common/timing.c -o x86_mt_instructionrate aarch64: gcc -pthread mt_instructionrate.c arm_mt_instructionrate.s ../Common/timing.c -o arm_mt_instructionrate +ppc64: + gcc -pthread -mregnames mt_instructionrate.c ppc64_mt_instructionrate.s ../Common/timing.c -o ppc64_mt_instructionrate diff --git a/mt_instructionrate/arm_mt_instructionrate b/mt_instructionrate/arm_mt_instructionrate deleted file mode 100755 index c9470ac..0000000 Binary files a/mt_instructionrate/arm_mt_instructionrate and /dev/null differ diff --git a/mt_instructionrate/mt_instructionrate.c b/mt_instructionrate/mt_instructionrate.c index 89abd60..7829240 100644 --- a/mt_instructionrate/mt_instructionrate.c +++ b/mt_instructionrate/mt_instructionrate.c @@ -45,6 +45,11 @@ int *coreList = NULL; #include "x86_mt_instructionrate.c" #endif + +#ifdef __PPC64__ +#include "ppc64_mt_instructionrate.c" +#endif + int main(int argc, char *argv[]) { char parseBuffer[512]; int parseIndices[64]; @@ -111,7 +116,6 @@ float measureFunction(uint64_t baseIterations, uint64_t (*testFunc)(uint64_t, vo pthread_t* testThreads = (pthread_t*)malloc(threadCount * sizeof(pthread_t)); #else HANDLE* testThreads = (HANDLE*)malloc(threadCount * sizeof(HANDLE)); - DWORD* tids = (DWORD*)malloc(threadCount * sizeof(DWORD)); #endif do { diff --git a/mt_instructionrate/ppc64_mt_instructionrate.c b/mt_instructionrate/ppc64_mt_instructionrate.c new file mode 100644 index 0000000..3cdf78f --- /dev/null +++ b/mt_instructionrate/ppc64_mt_instructionrate.c @@ -0,0 +1,40 @@ +extern uint64_t vec_int32_add_test(uint64_t iterations, void *data); +extern uint64_t vec_int32_mul_test(uint64_t iterations, void *data); +extern uint64_t vec_fp32_add_test(uint64_t iterations, void *data); 
+extern uint64_t vec_fp32_fma_test(uint64_t iterations, void *data);
+extern uint64_t vec_fp32_isqrt_test(uint64_t iterations, void *data);
+extern uint64_t fp64_add_test(uint64_t iterations, void *data);
+extern uint64_t fp64_fma_test(uint64_t iterations, void *data);
+
+/*
+ * Runs each PPC64 kernel through measureFunction() and prints one result
+ * line per kernel to stdout under the "GOPS/s" heading.
+ *
+ * Every kernel receives a pointer to input data of the element type it
+ * operates on: uint32_t for the integer kernels, float for the FP32 kernels
+ * and double for the FP64 kernels. The assembly kernels read only from the
+ * beginning of the buffer they are given (vector loads via lvx, or eight
+ * doubles via lfd).
+ *
+ * If any allocation fails, a message is written to stderr and the function
+ * returns without running the tests.
+ *
+ * NOTE(review): the vector kernels use lvx on the buffer start, so this
+ * presumably relies on malloc() returning 16-byte aligned memory - confirm
+ * for the target libc.
+ */
+void RunTests() {
+    uint64_t iterations = 3500000000;
+    int testDataLength = 256;
+    uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);
+    float *fp32TestArr = (float *)malloc(sizeof(float) * testDataLength);
+    double *fp64TestArr = (double *)malloc(sizeof(double) * testDataLength);
+    if (intTestArr == NULL || fp32TestArr == NULL || fp64TestArr == NULL) {
+        fprintf(stderr, "Failed to allocate test data\n");
+        free(intTestArr);   /* free(NULL) is a no-op, so no guards needed */
+        free(fp32TestArr);
+        free(fp64TestArr);
+        return;
+    }
+
+    for (int i = 0; i < testDataLength; i++) {
+        intTestArr[i] = i;
+        /* Kept as real floating-point values: storing i * 1.2f into an
+         * integer array would truncate it and hand the FP kernels integer
+         * bit patterns instead of floats. */
+        fp32TestArr[i] = i * 1.2f;
+        fp64TestArr[i] = i * 1.2;
+    }
+
+    fprintf(stderr, "Measuring INT32 adds\n");
+    float int32adds = measureFunction(iterations, vec_int32_add_test, intTestArr);
+    float int32muls = measureFunction(iterations, vec_int32_mul_test, intTestArr);
+    float fp32adds = measureFunction(iterations, vec_fp32_add_test, fp32TestArr);
+    float fp32fmas = measureFunction(iterations, vec_fp32_fma_test, fp32TestArr);
+    float fp32isqrt = measureFunction(iterations, vec_fp32_isqrt_test, fp32TestArr);
+    /* The FP64 kernels load doubles, so they get the double array. */
+    float fp64adds = measureFunction(iterations, fp64_add_test, fp64TestArr);
+    float fp64fmas = measureFunction(iterations, fp64_fma_test, fp64TestArr);
+
+    printf("-----GOPS/s-----\n");
+    printf("Altivec INT32 Add: %f\n", int32adds);
+    printf("Altivec INT32 Multiply: %f\n", int32muls);
+    printf("Altivec FP32 Add: %f\n", fp32adds);
+    /* An FMA is counted as two floating-point operations for GFLOPS. */
+    printf("Altivec FP32 FMA: %f (%f GFLOPS)\n", fp32fmas, 2 * fp32fmas);
+    printf("Altivec FP32 Inverse Square Root: %f\n", fp32isqrt);
+    printf("FP64 Add: %f\n", fp64adds);
+    printf("FP64 FMA: %f (%f GFLOPS)\n", fp64fmas, 2 * fp64fmas);
+
+    free(intTestArr);
+    free(fp32TestArr);
+    free(fp64TestArr);
+    return;
+}
diff --git a/mt_instructionrate/ppc64_mt_instructionrate.s b/mt_instructionrate/ppc64_mt_instructionrate.s
new file mode 100644
index 0000000..382834e
--- /dev/null
+++ b/mt_instructionrate/ppc64_mt_instructionrate.s
@@ -0,0 +1,224 @@
+.text
+
+.global vec_int32_add_test
+.global vec_int32_mul_test
+.global 
vec_fp32_add_test
+.global vec_fp32_fma_test
+.global vec_fp32_isqrt_test
+.global fp64_add_test
+.global fp64_fma_test
+
+/* r3 = iterations, r4 = ptr to arr */
+
+/*
+ * vec_int32_add_test: loads v0-v5 from the first six 16-byte slots of the
+ * array, then repeats six vadduwm instructions (each register added to
+ * itself) per loop pass. r9 counts operations: it starts at 0, advances by
+ * 24 per pass (6 instructions x 4 32-bit lanes) and the loop continues while
+ * r3 > r9 as an unsigned compare. r3 is never written, so it still holds
+ * the iterations argument at blr.
+ * NOTE(review): the .quad line looks like an ELFv1-style function
+ * descriptor (entry, TOC base, 0) - confirm against the target ABI.
+ */
+vec_int32_add_test:
+    .quad .L.vec_int32_add_test,.TOC.@tocbase,0
+.L.vec_int32_add_test:
+    li r9, 0
+    lvx v0, r4, r9
+    li r9, 16
+    lvx v1, r4, r9
+    li r9, 32
+    lvx v2, r4, r9
+    li r9, 48
+    lvx v3, r4, r9
+    li r9, 64
+    lvx v4, r4, r9
+    li r9, 80
+    lvx v5, r4, r9
+    li r9, 0                    /* r9 is reused as the operation counter */
+vec_int32_add_test_loop:
+    vadduwm v0, v0, v0
+    vadduwm v1, v1, v1
+    vadduwm v2, v2, v2
+    vadduwm v3, v3, v3
+    vadduwm v4, v4, v4
+    vadduwm v5, v5, v5          /* v5 accumulates on itself, like v0-v4 */
+    addi r9, r9, 24
+    cmpld cr7, r3, r9
+    bgt cr7, vec_int32_add_test_loop
+    blr
+
+/*
+ * vec_int32_mul_test: loads v0-v7 from eight consecutive 16-byte slots, then
+ * repeats eight vmuleuh instructions (each register multiplied by itself)
+ * per loop pass. r9 advances by 32 per pass and the loop continues while
+ * r3 > r9 (unsigned).
+ * NOTE(review): vmuleuh is a halfword multiply; the caller reports this
+ * kernel as an INT32 multiply - confirm that is the intended instruction.
+ */
+vec_int32_mul_test:
+    .quad .L.vec_int32_mul_test,.TOC.@tocbase,0
+.L.vec_int32_mul_test:
+    li r9, 0
+    lvx v0, r4, r9
+    li r9, 16
+    lvx v1, r4, r9
+    li r9, 32
+    lvx v2, r4, r9
+    li r9, 48
+    lvx v3, r4, r9
+    li r9, 64
+    lvx v4, r4, r9
+    li r9, 80
+    lvx v5, r4, r9
+    li r9, 96
+    lvx v6, r4, r9
+    li r9, 112                  /* next 16-byte slot after 96 */
+    lvx v7, r4, r9
+    li r9, 0                    /* r9 is reused as the operation counter */
+vec_int32_mul_test_loop:
+    vmuleuh v0, v0, v0
+    vmuleuh v1, v1, v1
+    vmuleuh v2, v2, v2
+    vmuleuh v3, v3, v3
+    vmuleuh v4, v4, v4
+    vmuleuh v5, v5, v5
+    vmuleuh v6, v6, v6
+    vmuleuh v7, v7, v7
+    addi r9, r9, 32
+    cmpld cr7, r3, r9
+    bgt cr7, vec_int32_mul_test_loop
+    blr
+
+/*
+ * vec_fp32_add_test: same structure as the multiply kernel above, with
+ * eight vaddfp instructions (each register added to itself) per loop pass
+ * and r9 advancing by 32 per pass.
+ */
+vec_fp32_add_test:
+    .quad .L.vec_fp32_add_test,.TOC.@tocbase,0
+.L.vec_fp32_add_test:
+    li r9, 0
+    lvx v0, r4, r9
+    li r9, 16
+    lvx v1, r4, r9
+    li r9, 32
+    lvx v2, r4, r9
+    li r9, 48
+    lvx v3, r4, r9
+    li r9, 64
+    lvx v4, r4, r9
+    li r9, 80
+    lvx v5, r4, r9
+    li r9, 96
+    lvx v6, r4, r9
+    li r9, 112                  /* next 16-byte slot after 96 */
+    lvx v7, r4, r9
+    li r9, 0                    /* r9 is reused as the operation counter */
+vec_fp32_add_test_loop:
+    vaddfp v0, v0, v0
+    vaddfp v1, v1, v1
+    vaddfp v2, v2, v2
+    vaddfp v3, v3, v3
+    vaddfp v4, v4, v4
+    vaddfp v5, v5, v5
+    vaddfp v6, v6, v6
+    vaddfp v7, v7, v7
+    addi r9, r9, 32
+    cmpld cr7, r3, r9
+    bgt cr7, vec_fp32_add_test_loop
+    blr
+
+vec_fp32_fma_test:
+    .quad .L.vec_fp32_fma_test,.TOC.@tocbase,0
+.L.vec_fp32_fma_test:
+    li r9, 0
+    lvx v0, r4, r9
+    li r9, 
16
+    lvx v1, r4, r9
+    li r9, 32
+    lvx v2, r4, r9
+    li r9, 48
+    lvx v3, r4, r9
+    li r9, 64
+    lvx v4, r4, r9
+    li r9, 80
+    lvx v5, r4, r9
+    li r9, 96
+    lvx v6, r4, r9
+    li r9, 112                  /* next 16-byte slot after 96 */
+    lvx v7, r4, r9
+    li r9, 0                    /* r9 is reused as the operation counter */
+/*
+ * FMA loop: eight vmaddfp instructions per pass, each using one register as
+ * all three sources and the destination. r9 advances by 32 per pass and the
+ * loop continues while r3 > r9 (unsigned).
+ */
+vec_fp32_fma_test_loop:
+    vmaddfp v0, v0, v0, v0
+    vmaddfp v1, v1, v1, v1
+    vmaddfp v2, v2, v2, v2
+    vmaddfp v3, v3, v3, v3
+    vmaddfp v4, v4, v4, v4
+    vmaddfp v5, v5, v5, v5
+    vmaddfp v6, v6, v6, v6
+    vmaddfp v7, v7, v7, v7
+    addi r9, r9, 32
+    cmpld cr7, r3, r9
+    /* Must loop on this kernel's own label; branching to the vaddfp loop
+     * would time adds instead of FMAs. */
+    bgt cr7, vec_fp32_fma_test_loop
+    blr
+
+/*
+ * vec_fp32_isqrt_test: loads v0-v7 from eight consecutive 16-byte slots,
+ * then repeats eight vrsqrtefp instructions (each register replaced by the
+ * estimate computed from itself) per loop pass. r9 advances by 32 per pass
+ * and the loop continues while r3 > r9 (unsigned).
+ */
+vec_fp32_isqrt_test:
+    .quad .L.vec_fp32_isqrt_test,.TOC.@tocbase,0
+.L.vec_fp32_isqrt_test:
+    li r9, 0
+    lvx v0, r4, r9
+    li r9, 16
+    lvx v1, r4, r9
+    li r9, 32
+    lvx v2, r4, r9
+    li r9, 48
+    lvx v3, r4, r9
+    li r9, 64
+    lvx v4, r4, r9
+    li r9, 80
+    lvx v5, r4, r9
+    li r9, 96
+    lvx v6, r4, r9
+    li r9, 112                  /* next 16-byte slot after 96 */
+    lvx v7, r4, r9
+    li r9, 0                    /* r9 is reused as the operation counter */
+vec_fp32_isqrt_test_loop:
+    vrsqrtefp v0, v0
+    vrsqrtefp v1, v1
+    vrsqrtefp v2, v2
+    vrsqrtefp v3, v3
+    vrsqrtefp v4, v4
+    vrsqrtefp v5, v5
+    vrsqrtefp v6, v6
+    vrsqrtefp v7, v7
+    addi r9, r9, 32
+    cmpld cr7, r3, r9
+    bgt cr7, vec_fp32_isqrt_test_loop
+    blr
+
+/*
+ * fp64_add_test: loads f0-f7 from the first eight doubles at r4, then
+ * repeats eight fadd instructions (each register added to itself) per loop
+ * pass. r9 advances by 8 per pass and the loop continues while r3 > r9
+ * (unsigned).
+ */
+fp64_add_test:
+    .quad .L.fp64_add_test,.TOC.@tocbase,0
+.L.fp64_add_test:
+    lfd f0, 0(r4)
+    lfd f1, 8(r4)
+    lfd f2, 16(r4)
+    lfd f3, 24(r4)
+    lfd f4, 32(r4)
+    lfd f5, 40(r4)
+    lfd f6, 48(r4)
+    lfd f7, 56(r4)
+    /* The counter must start from zero: nothing else in this function sets
+     * r9 before the loop compares it against r3. */
+    li r9, 0
+fp64_add_test_loop:
+    fadd f0, f0, f0
+    fadd f1, f1, f1
+    fadd f2, f2, f2
+    fadd f3, f3, f3
+    fadd f4, f4, f4
+    fadd f5, f5, f5
+    fadd f6, f6, f6
+    fadd f7, f7, f7
+    addi r9, r9, 8
+    cmpld cr7, r3, r9
+    bgt cr7, fp64_add_test_loop
+    blr
+
+/*
+ * fp64_fma_test: same structure as fp64_add_test, with eight fmadd
+ * instructions per loop pass, each using one register as all three sources
+ * and the destination.
+ */
+fp64_fma_test:
+    .quad .L.fp64_fma_test,.TOC.@tocbase,0
+.L.fp64_fma_test:
+    lfd f0, 0(r4)
+    lfd f1, 8(r4)
+    lfd f2, 16(r4)
+    lfd f3, 24(r4)
+    lfd f4, 32(r4)
+    lfd f5, 40(r4)
+    lfd f6, 48(r4)
+    lfd f7, 56(r4)
+    /* The counter must start from zero: nothing else in this function sets
+     * r9 before the loop compares it against r3. */
+    li r9, 0
+fp64_fma_test_loop:
+    fmadd f0, f0, f0, f0
+    fmadd f1, f1, f1, f1
+    fmadd f2, f2, f2, f2
+    fmadd f3, f3, f3, f3
+    fmadd f4, f4, f4, f4
+    fmadd f5, f5, f5, f5
+    fmadd f6, f6, f6, f6
+    fmadd f7, f7, f7, f7
+    addi r9, r9, 8
+    cmpld 
cr7, r3, r9 + bgt cr7, fp64_fma_test_loop + blr diff --git a/mt_instructionrate/x86_mt_instructionrate b/mt_instructionrate/x86_mt_instructionrate deleted file mode 100755 index 2e64256..0000000 Binary files a/mt_instructionrate/x86_mt_instructionrate and /dev/null differ