From 8341a0410bc58d7687e2fcea681b9dc05562cf0f Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Wed, 27 Nov 2024 01:55:32 -0500
Subject: [PATCH] [Kernel] Remove if-else with identical branches in marlin 2:4
 (#10687)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 .../marlin/sparse/marlin_24_cuda_kernel.cu             | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
index 8fce76eb52f9b..17837351324be 100644
--- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
+++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
@@ -296,13 +296,9 @@ __global__ void Marlin_24(
   // We use a different scale layout for grouped and column-wise quantization as
   // we scale a `half2` tile in column-major layout in the former and in
   // row-major in the latter case.
-  if (group_blocks != -1) {
-    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
-              (threadIdx.x % 32) / 4;
-  } else {
-    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
-              (threadIdx.x % 32) / 4;
-  }
+  s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+            (threadIdx.x % 32) / 4;  // Note that in the original Marlin kernel
+                                     // this is (threadIdx.x % 32) % 4 (verify)
 
   // Precompute which thread should not read memory in which iterations; this is
   // needed if there are more threads than required for a certain tilesize or