diff --git a/instructionrate/arm_instructionrate.c b/instructionrate/arm_instructionrate.c
index af5b56b..abb9c5b 100644
--- a/instructionrate/arm_instructionrate.c
+++ b/instructionrate/arm_instructionrate.c
@@ -1,12 +1,20 @@
+#define  _GNU_SOURCE
 #include <stdio.h>
 #include <sys/time.h>
 #include <time.h>
 #include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sched.h>
 
 extern uint64_t noptest(uint64_t iterations);
 extern uint64_t clktest(uint64_t iterations);
 
 extern uint64_t addtest(uint64_t iterations);
+extern uint64_t eortest(uint64_t iterations);
+extern uint64_t maddaddtest(uint64_t iterations);
+extern uint64_t cmptest(uint64_t iterations);
 extern uint64_t addmultest(uint64_t iterations);
 extern uint64_t addmul21test(uint64_t iterations);
 extern uint64_t mul32test(uint64_t iterations);
@@ -111,6 +118,14 @@ int main(int argc, char *argv[]) {
   uint64_t iterationsHigh = iterations * 5;
   uint64_t time_diff_ms;
   float latency, opsPerNs, clockSpeedGhz;
+  
+  if (argc > 1) {  // optional argv[1]: index of the CPU to pin this benchmark to
+    int targetCpu = (int)strtol(argv[1], NULL, 10);  // non-numeric input parses as 0
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(targetCpu, &cpuset);
+    if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) != 0) { perror("sched_setaffinity"); }  // pid 0 = calling thread, so gettid() (glibc >= 2.30 wrapper) is not needed; on failure warn and continue unpinned
+  }
 
   // figure out clock speed
   gettimeofday(&startTv, &startTz);
@@ -120,62 +135,65 @@ int main(int argc, char *argv[]) {
   latency = 1e6 * (float)time_diff_ms / (float)iterations;
   // clk speed should be 1/latency, assuming we got one add per clk, roughly
   clockSpeedGhz = 1/latency;
-  printf("Estimated clock speed: %.2f GHz\n", clockSpeedGhz);
-
-  printf("Adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest));
-  printf("Nops per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest));
-  printf("Indepdent movs per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest));
-  printf("Dependent movs per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest));
-  printf("eor -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest));
-  printf("mov -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest));
-  printf("sub -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest));
-
-
-  printf("Not taken jmps per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, jmptest));
-  printf("Jump fusion test: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fusejmptest));
-  printf("1:1 mixed not taken jmps / muls per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest));
-  printf("1:2 mixed not taken jmps / muls per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest21));
-  printf("1:1 mixed not taken jmps / adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmptest));
-  printf("1:2 mixed not taken jmps / adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmp21test));
-  printf("1:1 mixed add/mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmultest));
-  printf("2:1 mixed add/mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmul21test));
-  printf("ror per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, rortest));
-  printf("1:1 mixed mul/ror per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulrortest));
-  printf("32-bit mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test));
-  printf("64-bit mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test));
-  printf("scalar fp32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, faddwrapper));
-  printf("128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecadd128wrapper));
-  printf("128-bit vec int32 multiply per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecmul128wrapper));
-  printf("128-bit vec int32 mixed multiply and add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddmul128wrapper));
-  printf("128-bit vec fp32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfadd128wrapper));
-  printf("128-bit vec fp32 multiply per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfmul128wrapper));
-  printf("128-bit vec fp32 mixed multiply and add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfmul128wrapper));
-  printf("2:1 mixed scalar adds and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddvecadd128wrapper));
-  printf("3:1 mixed scalar adds and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix3to1addvecadd128wrapper));
-  printf("1:1 mixed scalar adds and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix1to1addvecadd128wrapper));
-  printf("1:1 mixed scalar 32-bit multiply and 128-bit vec int32 multiply per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulvecmulwrapper));
-  printf("1:1 mixed 128-bit vec fp32 multiply and 128-bit vec int32 multiply per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecmulfmulwrapper));
-  printf("1:1 mixed 128-bit vec fp32 add and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddfaddwrapper));
-  printf("1:2 mixed not taken jumps and 128-bit vec int32 add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecaddwrapper));
-  printf("1:1 mixed not taken jumps and 128-bit vec int32 mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecmulwrapper));
-  printf("128-bit vec loads per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecloadwrapper));
-  printf("128-bit vec stores per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecstorewrapper));
-  printf("64-bit loads per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, loadwrapper));
-  printf("1:1 mixed 64-bit loads/stores per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixloadstorewrapper));
-  printf("2:1 mixed 64-bit loads/stores per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix21loadstorewrapper));
-  printf("64-bit multiply latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul64test));
-  printf("128-bit vec int32 add latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecadd128wrapper));
-  printf("128-bit vec int32 mul latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecmul128wrapper));
-  printf("Scalar FADD Latency: %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latfaddwrapper));
-  printf("128-bit vector FADD latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfadd128wrapper));
-  printf("128-bit vector FMUL latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfmul128wrapper));
-
-  printf("128-bit vector FMA per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfma128wrapper));
-  printf("128-bit vector FMA latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfma128wrapper));
-  printf("Scalar FMA per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, scalarfmawrapper));
-  printf("Scalar FMA latency: %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latscalarfmawrapper));
-  printf("1:1 mixed 128-bit vector FMA/FADD per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfma128wrapper));
-  printf("1:1 mixed 128-bit vector FMA/FMUL per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfmulfma128wrapper));
+  printf("Estimated clock speed> %.2f GHz\n", clockSpeedGhz);  // every label in this run ends in '>' before the value
+
+  printf("Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest));  // rate tests are run with iterationsHigh
+  printf("XORs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, eortest));  // eortest, cmptest and maddaddtest are defined in arm_instructionrate.s
+  printf("CMPs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, cmptest));
+  printf("1:3 madd:add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, maddaddtest));
+  printf("Nops per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest));
+  printf("Indepdent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest));
+  printf("Dependent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest));
+  printf("eor -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest));
+  printf("mov -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest));
+  printf("sub -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest));
+
+
+  printf("Not taken jmps per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, jmptest));
+  printf("Jump fusion test> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fusejmptest));
+  printf("1:1 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest));
+  printf("1:2 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest21));
+  printf("1:1 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmptest));
+  printf("1:2 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmp21test));
+  printf("1:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmultest));
+  printf("2:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmul21test));
+  printf("ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, rortest));
+  printf("1:1 mixed mul/ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulrortest));
+  printf("32-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test));
+  printf("64-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test));  // NOTE(review): passes mul32test, same as the 32-bit line above -- confirm whether a 64-bit multiply test was intended
+  printf("scalar fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, faddwrapper));
+  printf("128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecadd128wrapper));
+  printf("128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecmul128wrapper));
+  printf("128-bit vec int32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddmul128wrapper));
+  printf("128-bit vec fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfadd128wrapper));
+  printf("128-bit vec fp32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfmul128wrapper));
+  printf("128-bit vec fp32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfmul128wrapper));
+  printf("2:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddvecadd128wrapper));
+  printf("3:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix3to1addvecadd128wrapper));
+  printf("1:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix1to1addvecadd128wrapper));
+  printf("1:1 mixed scalar 32-bit multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulvecmulwrapper));
+  printf("1:1 mixed 128-bit vec fp32 multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecmulfmulwrapper));
+  printf("1:1 mixed 128-bit vec fp32 add and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddfaddwrapper));
+  printf("1:2 mixed not taken jumps and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecaddwrapper));
+  printf("1:1 mixed not taken jumps and 128-bit vec int32 mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecmulwrapper));
+  printf("128-bit vec loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecloadwrapper));
+  printf("128-bit vec stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecstorewrapper));
+  printf("64-bit loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, loadwrapper));
+  printf("1:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixloadstorewrapper));
+  printf("2:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix21loadstorewrapper));
+  printf("64-bit multiply latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul64test));  // latency lines print the reciprocal of measureFunction's result
+  printf("128-bit vec int32 add latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecadd128wrapper));
+  printf("128-bit vec int32 mul latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecmul128wrapper));
+  printf("Scalar FADD Latency> %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latfaddwrapper));  // NOTE(review): uses iterationsHigh while the neighbouring latency lines use iterations -- confirm intent
+  printf("128-bit vector FADD latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfadd128wrapper));
+  printf("128-bit vector FMUL latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfmul128wrapper));
+
+  printf("128-bit vector FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfma128wrapper));
+  printf("128-bit vector FMA latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfma128wrapper));
+  printf("Scalar FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, scalarfmawrapper));
+  printf("Scalar FMA latency> %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latscalarfmawrapper));
+  printf("1:1 mixed 128-bit vector FMA/FADD per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfma128wrapper));
+  printf("1:1 mixed 128-bit vector FMA/FMUL per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfmulfma128wrapper));
   return 0;
 }
 
diff --git a/instructionrate/arm_instructionrate.s b/instructionrate/arm_instructionrate.s
index 00a2e95..ab365fe 100644
--- a/instructionrate/arm_instructionrate.s
+++ b/instructionrate/arm_instructionrate.s
@@ -2,6 +2,9 @@
 
 .global clktest
 .global addtest
+.global eortest
+.global maddaddtest
+.global cmptest
 .global addmultest
 .global addmul21test
 .global mixaddjmp21test
@@ -192,8 +195,158 @@ addtest_loop:
   ldp x12, x13, [sp, #0x20]
   ldp x14, x15, [sp, #0x10]
   add sp, sp, #0x50
+  ret 
+
+maddaddtest:  // x0 = iteration count; each loop pass issues 15 adds and 5 madds; x0 is 0 on return
+  sub sp, sp, #0x50  // reserve 0x50 bytes of stack; x8-x15 are spilled at offsets 0x10-0x40
+  stp x14, x15, [sp, #0x10]
+  stp x12, x13, [sp, #0x20]
+  stp x10, x11, [sp, #0x30]
+  stp x8, x9, [sp, #0x40]
+  mov x15, 1  // addend used by every add and madd in the loop
+  mov x14, 20  // instructions counted per loop pass; subtracted from x0 each pass
+  eor x13, x13, x13  // zero the three add accumulators x13, x12, x11
+  eor x12, x12, x12
+  eor x11, x11, x11
+  mov x10, 2
+  eor x9, x9, x9  // x9 is zeroed here but not referenced inside the loop
+  mov x8, 3  // madd multiplicand
+maddaddtest_loop:
+  add x13, x13, x15  // three add chains, one per accumulator, repeated five times per pass
+  add x12, x12, x15
+  add x11, x11, x15
+  madd x10, x8, x0, x15  // x10 = x8 * x0 + x15; x10 is written but never read, so the madds do not chain on each other
+  add x13, x13, x15
+  add x12, x12, x15
+  add x11, x11, x15
+  madd x10, x8, x0, x15 
+  add x13, x13, x15
+  add x12, x12, x15
+  add x11, x11, x15
+  madd x10, x8, x0, x15  
+  add x13, x13, x15
+  add x12, x12, x15
+  add x11, x11, x15
+  madd x10, x8, x0, x15  
+  add x13, x13, x15
+  add x12, x12, x15
+  add x11, x11, x15
+  madd x10, x8, x0, x15  
+  sub x0, x0, x14  // count down by the 20 instructions just issued
+  cbnz x0, maddaddtest_loop  // exits only when x0 is exactly 0, so x0 is expected to start as a multiple of 20
+  ldp x8, x9, [sp, #0x40]  // restore the spilled registers
+  ldp x10, x11, [sp, #0x30]
+  ldp x12, x13, [sp, #0x20]
+  ldp x14, x15, [sp, #0x10]
+  add sp, sp, #0x50
+  ret 
+
+eortest:  // x0 = iteration count; each loop pass issues 30 eor instructions; x0 is 0 on return
+  sub sp, sp, #0x50  // reserve 0x50 bytes of stack; x8-x15 are spilled at offsets 0x10-0x40
+  stp x14, x15, [sp, #0x10]
+  stp x12, x13, [sp, #0x20]
+  stp x10, x11, [sp, #0x30]
+  stp x8, x9, [sp, #0x40]
+  mov x15, 1  // second operand of every eor in the loop
+  mov x14, 30  // instructions counted per loop pass; subtracted from x0 each pass
+  eor x13, x13, x13  // zero the five accumulators x13, x12, x11, x10, x9
+  eor x12, x12, x12
+  eor x11, x11, x11
+  eor x10, x10, x10
+  eor x9, x9, x9
+eortest_loop:
+  eor x13, x13, x15  // five eor chains, one per accumulator, repeated six times per pass
+  eor x12, x12, x15
+  eor x11, x11, x15
+  eor x10, x10, x15
+  eor x9, x9, x15
+  eor x13, x13, x15
+  eor x12, x12, x15
+  eor x11, x11, x15
+  eor x10, x10, x15
+  eor x9, x9, x15
+  eor x13, x13, x15
+  eor x12, x12, x15
+  eor x11, x11, x15
+  eor x10, x10, x15
+  eor x9, x9, x15
+  eor x13, x13, x15
+  eor x12, x12, x15
+  eor x11, x11, x15
+  eor x10, x10, x15
+  eor x9, x9, x15
+  eor x13, x13, x15
+  eor x12, x12, x15
+  eor x11, x11, x15
+  eor x10, x10, x15
+  eor x9, x9, x15
+  eor x13, x13, x15
+  eor x12, x12, x15
+  eor x11, x11, x15
+  eor x10, x10, x15
+  eor x9, x9, x15
+  sub x0, x0, x14  // count down by the 30 instructions just issued
+  cbnz x0, eortest_loop  // exits only when x0 is exactly 0, so x0 is expected to start as a multiple of 30
+  ldp x8, x9, [sp, #0x40]  // restore the spilled registers
+  ldp x10, x11, [sp, #0x30]
+  ldp x12, x13, [sp, #0x20]
+  ldp x14, x15, [sp, #0x10]
+  add sp, sp, #0x50
   ret
 
+cmptest:  // x0 = iteration count; each loop pass issues 30 cmp instructions; x0 is 0 on return
+  sub sp, sp, #0x50  // reserve 0x50 bytes of stack; x8-x15 are spilled at offsets 0x10-0x40
+  stp x14, x15, [sp, #0x10]
+  stp x12, x13, [sp, #0x20]
+  stp x10, x11, [sp, #0x30]
+  stp x8, x9, [sp, #0x40]
+  mov x15, 1  // x15 is set here but not referenced inside the loop
+  mov x14, 30  // instructions counted per loop pass; subtracted from x0 each pass
+  eor x13, x13, x13  // zero the five registers that the cmps read
+  eor x12, x12, x12
+  eor x11, x11, x11
+  eor x10, x10, x10
+  eor x9, x9, x9
+cmptest_loop:
+  cmp x13, x13  // each cmp compares a register with itself; no general-purpose register is written in the loop body
+  cmp x12, x12
+  cmp x11, x11
+  cmp x10, x10
+  cmp x9, x9 
+  cmp x13, x13
+  cmp x12, x12
+  cmp x11, x11
+  cmp x10, x10
+  cmp x9, x9 
+  cmp x13, x13
+  cmp x12, x12
+  cmp x11, x11
+  cmp x10, x10
+  cmp x9, x9 
+  cmp x13, x13
+  cmp x12, x12
+  cmp x11, x11
+  cmp x10, x10
+  cmp x9, x9 
+  cmp x13, x13
+  cmp x12, x12
+  cmp x11, x11
+  cmp x10, x10
+  cmp x9, x9 
+  cmp x13, x13
+  cmp x12, x12
+  cmp x11, x11
+  cmp x10, x10
+  cmp x9, x9 
+  sub x0, x0, x14  // count down by the 30 instructions just issued
+  cbnz x0, cmptest_loop  // exits only when x0 is exactly 0, so x0 is expected to start as a multiple of 30
+  ldp x8, x9, [sp, #0x40]  // restore the spilled registers
+  ldp x10, x11, [sp, #0x30]
+  ldp x12, x13, [sp, #0x20]
+  ldp x14, x15, [sp, #0x10]
+  add sp, sp, #0x50
+  ret 
+
 addmultest:
   sub sp, sp, #0x50
   stp x14, x15, [sp, #0x10]