diff --git a/InstructionRate/x86_instructionrate.c b/InstructionRate/x86_instructionrate.c index e56a82e..10913cc 100644 --- a/InstructionRate/x86_instructionrate.c +++ b/InstructionRate/x86_instructionrate.c @@ -119,6 +119,7 @@ extern uint64_t aesencadd128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t aesencfma128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t aesencmul128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mix256faddintadd(uint64_t iterations) __attribute((sysv_abi)); +extern uint64_t movqtoxmmtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fma4_256(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fma4_128(uint64_t iterations) __attribute((sysv_abi)); @@ -448,6 +449,9 @@ int main(int argc, char *argv[]) { if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "fmul256", 6) == 0)) printf("256-bit FMUL per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mul256fp)); + if (testName == NULL || argc > 1 && strncmp(argv[1], "movqtoxmm", 9) == 0) + printf("MOVQ GPR <-> XMM: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, movqtoxmmtest)); + // integer multiply. zhaoxin appears to handle 16-bit and 64-bit multiplies differntly // unlike Intel/AMD CPUs that behave similarly regardless of register width if (testName == NULL || argc > 1 && strncmp(argv[1], "latmul16", 8) == 0) diff --git a/InstructionRate/x86_instructionrate.s b/InstructionRate/x86_instructionrate.s index 8111146..5fc4121 100644 --- a/InstructionRate/x86_instructionrate.s +++ b/InstructionRate/x86_instructionrate.s @@ -92,6 +92,7 @@ .global add128int .global mul128int .global mix256faddintadd +.global movqtoxmmtest .global pdeptest .global pexttest @@ -5230,7 +5231,6 @@ fdivtest_loop: sub %r9, %rdi jnz fdivtest_loop movq %xmm1, %rax - vzeroupper pop %r8 pop %r9 ret @@ -5365,3 +5365,38 @@ fmuldenormtest_loop: pop %r8 pop %r9 ret + +movqtoxmmtest: + push %r9 + push %r8 + push %r10 + mov $20, %r9 + mov $123, %r10 +movqtoxmmtest_loop: + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + movq %r10, %xmm1 + movq %xmm1, %r10 + sub %r9, %rdi + jnz movqtoxmmtest_loop + movq %xmm1, %rax + pop %r10 + pop %r8 + pop %r9 + ret diff --git a/LoadedMemoryLatency/LoadedMemoryLatency.c b/LoadedMemoryLatency/LoadedMemoryLatency.c index df08f6f..9a3acc7 100644 --- a/LoadedMemoryLatency/LoadedMemoryLatency.c +++ b/LoadedMemoryLatency/LoadedMemoryLatency.c @@ -48,6 +48,14 @@ int main(int argc, char *argv[]) { int coreCount = get_nprocs(); int latencyCore = 0; int *customCores = NULL; + if (argc == 1) { + fprintf(stderr, "Options:\n"); + fprintf(stderr, "-bwthreads [int]: Number of bandwidth test threads\n"); + fprintf(stderr, "-latencyaffinity [int]: Core to run latency test thread on\n"); + fprintf(stderr, "-bwcores [comma separated list]: Cores to run bandwidth load on\n"); + fprintf(stderr, "-scaleiterations [int]: Iterations scaling factor\n"); + fprintf(stderr, "-throttle [int]: Reduce bandwidth load per bandwidth test thread\n"); + } for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1; diff --git a/MemoryBandwidth/MemoryBandwidth.c b/MemoryBandwidth/MemoryBandwidth.c index b1f9c40..a3927cc 100644 --- a/MemoryBandwidth/MemoryBandwidth.c +++ b/MemoryBandwidth/MemoryBandwidth.c @@ -34,8 +34,8 @@ #pragma GCC diagnostic ignored "-Wattributes" -int default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048, - 3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304, +int default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048, 2560, + 3072, 4096, 5120, 6144, 8192, 10240, 12288, 14336, 15360, 16384, 18432, 20480, 24567, 32768, 65536, 98304, 131072, 262144, 393216, 524288, 1048576, 1572864, 2097152, 3145728 }; typedef struct BandwidthTestThreadData { diff --git a/MemoryBandwidth/MemoryBandwidth_x86.s b/MemoryBandwidth/MemoryBandwidth_x86.s index a8ca4e3..2656972 100644 --- a/MemoryBandwidth/MemoryBandwidth_x86.s +++ b/MemoryBandwidth/MemoryBandwidth_x86.s @@ -797,7 +797,7 @@ repstosb_copy_pass_loop: rep stosb dec %r8 jnz repstosb_copy_pass_loop - movss (%r12), %xmm0 + movss (%r13), %xmm0 pop %rdi pop %rsi pop %r12 @@ -823,7 +823,7 @@ repstosd_copy_pass_loop: rep stosl dec %r8 jnz repstosd_copy_pass_loop - movss (%r12), %xmm0 + movss (%r13), %xmm0 pop %rdi pop %rsi pop %r12 diff --git a/MemoryLatency/MemoryLatency.c b/MemoryLatency/MemoryLatency.c index 76300ab..6cbc622 100644 --- a/MemoryLatency/MemoryLatency.c +++ b/MemoryLatency/MemoryLatency.c @@ -25,8 +25,8 @@ #define PAGE_SIZE 4096 #define CACHELINE_SIZE 64 -int default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, - 3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304, +int default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, 2304, 2560, + 3072, 4096, 5120, 6144, 8192, 10240, 12288, 13312, 14336, 15360, 16384, 18432, 20480, 24567, 32768, 65536, 98304, 131072, 262144, 393216, 524288, 1048576 }; //2097152 }; #ifdef __x86_64