Add MatmulParams::cluster_dims parameter #3574

Merged (4 commits) — Dec 11, 2024
Changes shown are from the first 3 commits.
csrc/codegen.cpp — 3 changes: 1 addition & 2 deletions

```diff
@@ -276,8 +276,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
   code_ << "__global__ void ";
   if (kernel_->hasManaged("cluster_dims")) {
     auto cluster_dims =
-        kernel_->getManaged<std::tuple<int64_t, int64_t, int64_t>>(
-            "cluster_dims");
+        kernel_->getManaged<std::tuple<int, int, int>>("cluster_dims");
     code_ << "__cluster_dims__(" << std::get<0>(cluster_dims) << ", "
           << std::get<1>(cluster_dims) << ", " << std::get<2>(cluster_dims)
           << ") ";
```
csrc/scheduler/hopper_multi_matmul.cpp — 2 changes: 2 additions & 0 deletions

```diff
@@ -53,6 +53,8 @@ void HopperMultipleMatmulScheduler::run() {

   inspectPrologues();

+  setCGADims();
+
   scheduleOperands();

   // schedule mma instruction output (mma_result)
```
csrc/scheduler/hopper_multi_matmul.h — 8 changes: 8 additions & 0 deletions

```diff
@@ -149,6 +149,14 @@ class HopperMultipleMatmulScheduler : public MultipleMatmulScheduler {
   std::vector<std::vector<MatmulDimRole>> blockTileTensors(
       const std::vector<TensorView*>& tvs);

+  //! Specifies the CGA dimensions by setting "cluster_dims" as fusion-managed
+  //! data
+  void setCGADims() const {
+    if (params_->cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
+      fusion_->manage("cluster_dims", params_->cluster_dims);
+    }
+  }
+
   //! Schedule the loads of all operands from global memory to shared memory.
   //! Starting from the basic tiled schedule, we swizzle the operand memory.
   //! Note that the cache op and LoadStoreOpType are already set during
```
csrc/scheduler/matmul_heuristic.h — 4 changes: 4 additions & 0 deletions

```diff
@@ -179,6 +179,10 @@ class MatmulParams : public HeuristicParams {
   //! axis and perform a grid reduction before the epilogue.
   int splitk_factor = 1;

+  //! This is the CGA size on Hopper+ devices. This parameter is ignored on
+  //! Ampere and Turing.
+  std::tuple<int, int, int> cluster_dims = {2, 1, 1};
+
   std::string toString() const override {
     std::stringstream ss;
     ss << "\n===== Matmul Parameters ========\n"
```
tests/cpp/test_matmul.cpp — 5 changes: 2 additions & 3 deletions

```diff
@@ -3663,7 +3663,7 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) {
   const int64_t cta_m = 2 * getM(macro);
   const int64_t cta_n = 1 * getN(macro);

-  constexpr std::tuple<int64_t, int64_t, int64_t> cluster_dims{2, 1, 1};
+  constexpr std::tuple<int, int, int> cluster_dims{2, 1, 1};
```
Review comment (Collaborator):

super nitpick: can we stick with int64_t for consistency?

Suggested change:

```diff
-  constexpr std::tuple<int, int, int> cluster_dims{2, 1, 1};
+  constexpr std::tuple<int64_t, int64_t, int64_t> cluster_dims{2, 1, 1};
```

Reply (Collaborator, Author):

Yeah, I was mostly doing that because the MatmulParams entries are int, but we should probably just change MatmulParams instead (in another PR).

Reply (Collaborator, Author):

Done. Much smaller PR now...

```diff
   auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype);
   auto tv1 = makeContigConcreteTensor({-1, 1, -1}, dtype);
@@ -3680,8 +3680,7 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) {
   auto tv3 = castOp(DataType::Half, tv2);
   fusion.addOutput(tv3);

-  if constexpr (
-      cluster_dims != std::tuple<int64_t, int64_t, int64_t>{1, 1, 1}) {
+  if constexpr (cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
     fusion.manage("cluster_dims", cluster_dims);
   }
```