diff --git a/GpuMemLatency/instruction_rate.c b/GpuMemLatency/instruction_rate.c index 6aa6b0d..8cc9c97 100644 --- a/GpuMemLatency/instruction_rate.c +++ b/GpuMemLatency/instruction_rate.c @@ -59,7 +59,7 @@ float instruction_rate_test(cl_context context, float gOpsPerSec = 0, opsPerIteration; cl_int ret; int64_t time_diff_ms; - int float4_element_count = local_size * 4; + int float4_element_count = thread_count * 4; cl_program program = build_program(context, "instruction_rate_kernel.cl"); cl_kernel int32_add_rate_kernel = clCreateKernel(program, "int32_add_rate_test", &ret); @@ -101,7 +101,7 @@ float instruction_rate_test(cl_context context, } // 4x int4 * 8 per iteration, and count the loop increment too - opsPerIteration = 4.0f * 8.0f + 1.0f; + opsPerIteration = 4.0f * 8.0f; float int32_add_rate = run_rate_test(context, command_queue, int32_add_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT32 G Adds/sec: %f\n", int32_add_rate); @@ -255,6 +255,8 @@ uint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms) return chase_iterations; } +// Runs an instruction rate test. The kernel is expected to perform opsPerIteration * chase_iterations operations +// Returns GOPS float run_rate_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, @@ -271,7 +273,7 @@ float run_rate_test(cl_context context, size_t global_item_size = thread_count; size_t local_item_size = local_size; cl_int ret; - float gOpsPerSec; + float totalOps, gOpsPerSec; uint64_t time_diff_ms = 0; memset(result, 0, sizeof(float) * 4 * thread_count); @@ -304,14 +306,16 @@ float run_rate_test(cl_context context, } time_diff_ms = end_timing(); + + totalOps = (float)chase_iterations * opsPerIteration * (float)thread_count; + gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000); + //fprintf(stderr, "chase iterations: %d, thread count: %d\n", chase_iterations, thread_count); + //fprintf(stderr, "total ops: %f (%.2f G)\ntotal time: %llu ms\n", totalOps, totalOps / 1e9, time_diff_ms); + chase_iterations = adjust_iterations(chase_iterations, time_diff_ms); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); } - float totalOps = (float)chase_iterations * opsPerIteration * (float)thread_count; - gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000); - //fprintf(stderr, "chase iterations: %d, thread count: %d\n", chase_iterations, thread_count); - //fprintf(stderr, "total ops: %f (%.2f G)\ntotal time: %llu ms\n", totalOps, totalOps / 1e9, time_diff_ms); return gOpsPerSec; }