From 76fc8589107a2b8655f41ba4651cf4fc822e2009 Mon Sep 17 00:00:00 2001
From: clamchowder <lamchester@gmail.com>
Date: Fri, 23 Feb 2024 20:16:52 -0800
Subject: [PATCH] GpuMemLatency: print partition pattern and add mod-2/consecutive partition tests

---
 GpuMemLatency/instruction_rate.c | 23 ++++++++++++++++++-----
 GpuMemLatency/opencltest.c       | 14 +++++++++++---
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/GpuMemLatency/instruction_rate.c b/GpuMemLatency/instruction_rate.c
index b30c47d..96cf19b 100644
--- a/GpuMemLatency/instruction_rate.c
+++ b/GpuMemLatency/instruction_rate.c
@@ -335,7 +335,7 @@ float run_divergence_rate_test(cl_context context,
 {
     size_t global_item_size = thread_count;
     size_t local_item_size = local_size;
-    uint32_t actual_threads = thread_count;
+    uint32_t active_threads = thread_count;
     cl_int ret;
     float totalOps, gOpsPerSec;
     uint64_t time_diff_ms = 0;
@@ -348,8 +348,9 @@ float run_divergence_rate_test(cl_context context,
     float* A = (float*)malloc(sizeof(float) * thread_count);
     memset(result, 0, sizeof(float) * thread_count);
 
-    if (partitionPattern != NULL) actual_threads = 0;
+    if (partitionPattern != NULL) active_threads = 0;
 
+    fprintf(stderr, "\n");
     for (int i = 0; i < thread_count; i++)
     {
         if (partitionPattern == NULL) {
@@ -361,12 +362,24 @@ float run_divergence_rate_test(cl_context context,
         {
             if (partitionPattern[(i / wave)]) {
                 A[i] = 0.2f;
-                actual_threads++;
+                fprintf(stderr, "a ");
+                active_threads++;
+            }
+            else
+            {
+                fprintf(stderr, "_ ");
+                A[i] = 1.2f;
+            }
+
+            if ((i + 1) % wave == 0)
+            {
+                fprintf(stderr, "\n");
             }
-            else A[i] = 1.2f;
         }
     }
 
+    fprintf(stderr, "\nActive threads: %d\n", active_threads);
+
     cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, thread_count * sizeof(float), NULL, &ret);
     cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, thread_count * sizeof(float), NULL, &ret);
     ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, thread_count * sizeof(float), A, 0, NULL, NULL);
@@ -397,7 +410,7 @@ float run_divergence_rate_test(cl_context context,
 
         time_diff_ms = end_timing();
 
-        totalOps = (float)chase_iterations * 8 * (float)actual_threads;
+        totalOps = (float)chase_iterations * 8 * (float)active_threads;
         gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000);
         //fprintf(stderr, "chase iterations: %d, thread count: %d\n", chase_iterations, thread_count);
         //fprintf(stderr, "total ops: %f (%.2f G)\ntotal time: %llu ms\n", totalOps, totalOps / 1e9, time_diff_ms);
diff --git a/GpuMemLatency/opencltest.c b/GpuMemLatency/opencltest.c
index c4f590f..94595b2 100644
--- a/GpuMemLatency/opencltest.c
+++ b/GpuMemLatency/opencltest.c
@@ -690,11 +690,19 @@ int main(int argc, char* argv[]) {
     }
     else if (testType == Partition)
     {
-        int pattern[] = { 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0 };
+        int pattern4[] = { 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0 };
 
         // function and its associated kernel serve two purposes
-        float result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern);
-        printf("Throughput: %f\n", result);
+        float result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern4);
+        printf("Throughput (mod 4): %f\n", result);
+
+        int pattern2[] = { 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+        result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern2);
+        printf("Throughput (mod 2): %f\n", result);
+
+        int consec_pattern[] = { 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+        result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, consec_pattern);
+        printf("Throughput (x4): %f\n", result);
     }
 
     //printf("If you didn't run this through cmd, now you can copy the results. And press ctrl+c to close");