diff --git a/instructionrate/arm_instructionrate.c b/instructionrate/arm_instructionrate.c index af5b56b..abb9c5b 100644 --- a/instructionrate/arm_instructionrate.c +++ b/instructionrate/arm_instructionrate.c @@ -1,12 +1,19 @@ +#define _GNU_SOURCE #include #include #include #include +#include +#include +#include extern uint64_t noptest(uint64_t iterations); extern uint64_t clktest(uint64_t iterations); extern uint64_t addtest(uint64_t iterations); +extern uint64_t eortest(uint64_t iterations); +extern uint64_t maddaddtest(uint64_t iterations); +extern uint64_t cmptest(uint64_t iterations); extern uint64_t addmultest(uint64_t iterations); extern uint64_t addmul21test(uint64_t iterations); extern uint64_t mul32test(uint64_t iterations); @@ -111,6 +118,14 @@ int main(int argc, char *argv[]) { uint64_t iterationsHigh = iterations * 5; uint64_t time_diff_ms; float latency, opsPerNs, clockSpeedGhz; + + if (argc > 1) { + int targetCpu = atoi(argv[1]); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(targetCpu, &cpuset); + sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); + } // figure out clock speed gettimeofday(&startTv, &startTz); @@ -120,62 +135,65 @@ int main(int argc, char *argv[]) { latency = 1e6 * (float)time_diff_ms / (float)iterations; // clk speed should be 1/latency, assuming we got one add per clk, roughly clockSpeedGhz = 1/latency; - printf("Estimated clock speed: %.2f GHz\n", clockSpeedGhz); - - printf("Adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest)); - printf("Nops per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest)); - printf("Indepdent movs per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest)); - printf("Dependent movs per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest)); - printf("eor -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest)); - printf("mov -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest)); - printf("sub -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest)); - - - printf("Not taken jmps per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, jmptest)); - printf("Jump fusion test: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fusejmptest)); - printf("1:1 mixed not taken jmps / muls per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest)); - printf("1:2 mixed not taken jmps / muls per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest21)); - printf("1:1 mixed not taken jmps / adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmptest)); - printf("1:2 mixed not taken jmps / adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmp21test)); - printf("1:1 mixed add/mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmultest)); - printf("2:1 mixed add/mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmul21test)); - printf("ror per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, rortest)); - printf("1:1 mixed mul/ror per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulrortest)); - printf("32-bit mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test)); - printf("64-bit mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test)); - printf("scalar fp32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, faddwrapper)); - printf("128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecadd128wrapper)); - printf("128-bit vec int32 multiply per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecmul128wrapper)); - printf("128-bit vec int32 mixed multiply and add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddmul128wrapper)); - printf("128-bit vec fp32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfadd128wrapper)); - printf("128-bit vec fp32 multiply per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfmul128wrapper)); - printf("128-bit vec fp32 mixed multiply and add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfmul128wrapper)); - printf("2:1 mixed scalar adds and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddvecadd128wrapper)); - printf("3:1 mixed scalar adds and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix3to1addvecadd128wrapper)); - printf("1:1 mixed scalar adds and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix1to1addvecadd128wrapper)); - printf("1:1 mixed scalar 32-bit multiply and 128-bit vec int32 multiply per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulvecmulwrapper)); - printf("1:1 mixed 128-bit vec fp32 multiply and 128-bit vec int32 multiply per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecmulfmulwrapper)); - printf("1:1 mixed 128-bit vec fp32 add and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddfaddwrapper)); - printf("1:2 mixed not taken jumps and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecaddwrapper)); - printf("1:1 mixed not taken jumps and 128-bit vec int32 mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecmulwrapper)); - printf("128-bit vec loads per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecloadwrapper)); - printf("128-bit vec stores per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecstorewrapper)); - printf("64-bit loads per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, loadwrapper)); - printf("1:1 mixed 64-bit loads/stores per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixloadstorewrapper)); - printf("2:1 mixed 64-bit loads/stores per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix21loadstorewrapper)); - printf("64-bit multiply latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul64test)); - printf("128-bit vec int32 add latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecadd128wrapper)); - printf("128-bit vec int32 mul latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecmul128wrapper)); - printf("Scalar FADD Latency: %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latfaddwrapper)); - printf("128-bit vector FADD latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfadd128wrapper)); - printf("128-bit vector FMUL latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfmul128wrapper)); - - printf("128-bit vector FMA per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfma128wrapper)); - printf("128-bit vector FMA latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfma128wrapper)); - printf("Scalar FMA per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, scalarfmawrapper)); - printf("Scalar FMA latency: %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latscalarfmawrapper)); - printf("1:1 mixed 128-bit vector FMA/FADD per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfma128wrapper)); - printf("1:1 mixed 128-bit vector FMA/FMUL per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfmulfma128wrapper)); + printf("Estimated clock speed> %.2f GHz\n", clockSpeedGhz); + + printf("Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest)); + printf("XORs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, eortest)); + printf("CMPs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, cmptest)); + printf("1:3 madd:add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, maddaddtest)); + printf("Nops per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest)); + printf("Indepdent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest)); + printf("Dependent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest)); + printf("eor -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest)); + printf("mov -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest)); + printf("sub -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest)); + + + printf("Not taken jmps per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, jmptest)); + printf("Jump fusion test> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fusejmptest)); + printf("1:1 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest)); + printf("1:2 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest21)); + printf("1:1 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmptest)); + printf("1:2 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmp21test)); + printf("1:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmultest)); + printf("2:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmul21test)); + printf("ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, rortest)); + printf("1:1 mixed mul/ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulrortest)); + printf("32-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test)); + printf("64-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test)); + printf("scalar fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, faddwrapper)); + printf("128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecadd128wrapper)); + printf("128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecmul128wrapper)); + printf("128-bit vec int32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddmul128wrapper)); + printf("128-bit vec fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfadd128wrapper)); + printf("128-bit vec fp32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfmul128wrapper)); + printf("128-bit vec fp32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfmul128wrapper)); + printf("2:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddvecadd128wrapper)); + printf("3:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix3to1addvecadd128wrapper)); + printf("1:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix1to1addvecadd128wrapper)); + printf("1:1 mixed scalar 32-bit multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulvecmulwrapper)); + printf("1:1 mixed 128-bit vec fp32 multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecmulfmulwrapper)); + printf("1:1 mixed 128-bit vec fp32 add and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddfaddwrapper)); + printf("1:2 mixed not taken jumps and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecaddwrapper)); + printf("1:1 mixed not taken jumps and 128-bit vec int32 mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecmulwrapper)); + printf("128-bit vec loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecloadwrapper)); + printf("128-bit vec stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecstorewrapper)); + printf("64-bit loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, loadwrapper)); + printf("1:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixloadstorewrapper)); + printf("2:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix21loadstorewrapper)); + printf("64-bit multiply latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul64test)); + printf("128-bit vec int32 add latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecadd128wrapper)); + printf("128-bit vec int32 mul latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecmul128wrapper)); + printf("Scalar FADD Latency> %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latfaddwrapper)); + printf("128-bit vector FADD latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfadd128wrapper)); + printf("128-bit vector FMUL latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfmul128wrapper)); + + printf("128-bit vector FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfma128wrapper)); + printf("128-bit vector FMA latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfma128wrapper)); + printf("Scalar FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, scalarfmawrapper)); + printf("Scalar FMA latency> %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latscalarfmawrapper)); + printf("1:1 mixed 128-bit vector FMA/FADD per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfma128wrapper)); + printf("1:1 mixed 128-bit vector FMA/FMUL per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfmulfma128wrapper)); return 0; } diff --git a/instructionrate/arm_instructionrate.s b/instructionrate/arm_instructionrate.s index 00a2e95..ab365fe 100644 --- a/instructionrate/arm_instructionrate.s +++ b/instructionrate/arm_instructionrate.s @@ -2,6 +2,9 @@ .global clktest .global addtest +.global eortest +.global maddaddtest +.global cmptest .global addmultest .global addmul21test .global mixaddjmp21test @@ -192,8 +195,158 @@ addtest_loop: ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 + ret + +maddaddtest: + sub sp, sp, #0x50 + stp x14, x15, [sp, #0x10] + stp x12, x13, [sp, #0x20] + stp x10, x11, [sp, #0x30] + stp x8, x9, [sp, #0x40] + mov x15, 1 + mov x14, 20 + eor x13, x13, x13 + eor x12, x12, x12 + eor x11, x11, x11 + mov x10, 2 + eor x9, x9, x9 + mov x8, 3 +maddaddtest_loop: + add x13, x13, x15 + add x12, x12, x15 + add x11, x11, x15 + madd x10, x8, x0, x15 + add x13, x13, x15 + add x12, x12, x15 + add x11, x11, x15 + madd x10, x8, x0, x15 + add x13, x13, x15 + add x12, x12, x15 + add x11, x11, x15 + madd x10, x8, x0, x15 + add x13, x13, x15 + add x12, x12, x15 + add x11, x11, x15 + madd x10, x8, x0, x15 + add x13, x13, x15 + add x12, x12, x15 + add x11, x11, x15 + madd x10, x8, x0, x15 + sub x0, x0, x14 + cbnz x0, maddaddtest_loop + ldp x8, x9, [sp, #0x40] + ldp x10, x11, [sp, #0x30] + ldp x12, x13, [sp, #0x20] + ldp x14, x15, [sp, #0x10] + add sp, sp, #0x50 + ret + +eortest: + sub sp, sp, #0x50 + stp x14, x15, [sp, #0x10] + stp x12, x13, [sp, #0x20] + stp x10, x11, [sp, #0x30] + stp x8, x9, [sp, #0x40] + mov x15, 1 + mov x14, 30 + eor x13, x13, x13 + eor x12, x12, x12 + eor x11, x11, x11 + eor x10, x10, x10 + eor x9, x9, x9 +eortest_loop: + eor x13, x13, x15 + eor x12, x12, x15 + eor x11, x11, x15 + eor x10, x10, x15 + eor x9, x9, x15 + eor x13, x13, x15 + eor x12, x12, x15 + eor x11, x11, x15 + eor x10, x10, x15 + eor x9, x9, x15 + eor x13, x13, x15 + eor x12, x12, x15 + eor x11, x11, x15 + eor x10, x10, x15 + eor x9, x9, x15 + eor x13, x13, x15 + eor x12, x12, x15 + eor x11, x11, x15 + eor x10, x10, x15 + eor x9, x9, x15 + eor x13, x13, x15 + eor x12, x12, x15 + eor x11, x11, x15 + eor x10, x10, x15 + eor x9, x9, x15 + eor x13, x13, x15 + eor x12, x12, x15 + eor x11, x11, x15 + eor x10, x10, x15 + eor x9, x9, x15 + sub x0, x0, x14 + cbnz x0, eortest_loop + ldp x8, x9, [sp, #0x40] + ldp x10, x11, [sp, #0x30] + ldp x12, x13, [sp, #0x20] + ldp x14, x15, [sp, #0x10] + add sp, sp, #0x50 ret +cmptest: + sub sp, sp, #0x50 + stp x14, x15, [sp, #0x10] + stp x12, x13, [sp, #0x20] + stp x10, x11, [sp, #0x30] + stp x8, x9, [sp, #0x40] + mov x15, 1 + mov x14, 30 + eor x13, x13, x13 + eor x12, x12, x12 + eor x11, x11, x11 + eor x10, x10, x10 + eor x9, x9, x9 +cmptest_loop: + cmp x13, x13 + cmp x12, x12 + cmp x11, x11 + cmp x10, x10 + cmp x9, x9 + cmp x13, x13 + cmp x12, x12 + cmp x11, x11 + cmp x10, x10 + cmp x9, x9 + cmp x13, x13 + cmp x12, x12 + cmp x11, x11 + cmp x10, x10 + cmp x9, x9 + cmp x13, x13 + cmp x12, x12 + cmp x11, x11 + cmp x10, x10 + cmp x9, x9 + cmp x13, x13 + cmp x12, x12 + cmp x11, x11 + cmp x10, x10 + cmp x9, x9 + cmp x13, x13 + cmp x12, x12 + cmp x11, x11 + cmp x10, x10 + cmp x9, x9 + sub x0, x0, x14 + cbnz x0, cmptest_loop + ldp x8, x9, [sp, #0x40] + ldp x10, x11, [sp, #0x30] + ldp x12, x13, [sp, #0x20] + ldp x14, x15, [sp, #0x10] + add sp, sp, #0x50 + ret + addmultest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10]