From 942de54f160eb6bce26162e88ea90961fa5d6f2c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Wed, 11 Dec 2024 10:54:11 -0500
Subject: [PATCH 1/4] Add cluster_dims param to MatmulParams

Following #3557 we can specify the cluster size for our fusions.
Currently we don't do anything explicitly with CGAs, but this can help
guarantee that tiles are scheduled onto GPCs in pairs. Each GPC has a
number of TPCs, each of which holds 2 SMs, so this lets us take
advantage of caching at the TPC and GPC level for operand loads, in
addition to L2.
---
 csrc/codegen.cpp                       | 3 +--
 csrc/scheduler/hopper_multi_matmul.cpp | 2 ++
 csrc/scheduler/hopper_multi_matmul.h   | 8 ++++++++
 csrc/scheduler/matmul_heuristic.h      | 4 ++++
 tests/cpp/test_matmul.cpp              | 5 ++---
 5 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp
index 0060e626fe6..066670cc15d 100644
--- a/csrc/codegen.cpp
+++ b/csrc/codegen.cpp
@@ -276,8 +276,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
     code_ << "__global__ void ";
     if (kernel_->hasManaged("cluster_dims")) {
       auto cluster_dims =
-          kernel_->getManaged<std::tuple<int64_t, int64_t, int64_t>>(
-              "cluster_dims");
+          kernel_->getManaged<std::tuple<int, int, int>>("cluster_dims");
       code_ << "__cluster_dims__(" << std::get<0>(cluster_dims) << ", "
             << std::get<1>(cluster_dims) << ", " << std::get<2>(cluster_dims)
             << ") ";
diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp
index fbb95d46df2..a5b6f4d2bb7 100644
--- a/csrc/scheduler/hopper_multi_matmul.cpp
+++ b/csrc/scheduler/hopper_multi_matmul.cpp
@@ -53,6 +53,8 @@ void HopperMultipleMatmulScheduler::run() {
 
   inspectPrologues();
 
+  setCGADims();
+
   scheduleOperands();
 
   // schedule mma instruction output (mma_result)
diff --git a/csrc/scheduler/hopper_multi_matmul.h b/csrc/scheduler/hopper_multi_matmul.h
index bf7bc1df0f5..1d77785cc99 100644
--- a/csrc/scheduler/hopper_multi_matmul.h
+++ b/csrc/scheduler/hopper_multi_matmul.h
@@ -149,6 +149,14 @@ class HopperMultipleMatmulScheduler : public MultipleMatmulScheduler {
   std::vector<std::vector<MatmulDimRole>> blockTileTensors(
       const std::vector<TensorView*>& tvs);
 
+  //! Specifies the CGA dimensions by setting "cluster_dims" as fusion-managed
+  //! data
+  void setCGADims() const {
+    if (params_->cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
+      fusion_->manage("cluster_dims", params_->cluster_dims);
+    }
+  }
+
   //! Schedule the loads of all operands from global memory to shared memory.
   //! Starting from the basic tiled schedule, we swizzle the operand memory.
   //! Note that the cache op and LoadStoreOpType are already set during
diff --git a/csrc/scheduler/matmul_heuristic.h b/csrc/scheduler/matmul_heuristic.h
index 6a92d31fd2c..359f6332b51 100644
--- a/csrc/scheduler/matmul_heuristic.h
+++ b/csrc/scheduler/matmul_heuristic.h
@@ -179,6 +179,10 @@ class MatmulParams : public HeuristicParams {
   //! axis and perform a grid reduction before the epilogue.
   int splitk_factor = 1;
 
+  //! This is the CGA size on Hopper+ devices. This parameter is ignored on
+  //! Ampere and Turing.
+  std::tuple<int, int, int> cluster_dims = {2, 1, 1};
+
   std::string toString() const override {
     std::stringstream ss;
     ss << "\n===== Matmul Parameters ========\n"
diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp
index 88cc953c95d..1a95cdfd7f8 100644
--- a/tests/cpp/test_matmul.cpp
+++ b/tests/cpp/test_matmul.cpp
@@ -3663,7 +3663,7 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) {
   const int64_t cta_m = 2 * getM(macro);
   const int64_t cta_n = 1 * getN(macro);
 
-  constexpr std::tuple<int64_t, int64_t, int64_t> cluster_dims{2, 1, 1};
+  constexpr std::tuple<int, int, int> cluster_dims{2, 1, 1};
 
   auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype);
   auto tv1 = makeContigConcreteTensor({-1, 1, -1}, dtype);
@@ -3680,8 +3680,7 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) {
   auto tv3 = castOp(DataType::Half, tv2);
   fusion.addOutput(tv3);
 
-  if constexpr (
-      cluster_dims != std::tuple<int64_t, int64_t, int64_t>{1, 1, 1}) {
+  if constexpr (cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
     fusion.manage("cluster_dims", cluster_dims);
   }
 

From e3f611c86d0fdfda940d8b098e6cb6e979a0b1ea Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Wed, 11 Dec 2024 11:09:38 -0500
Subject: [PATCH 2/4] Move setCGADims to MultiMatmulScheduler, enable on Ampere

---
 csrc/scheduler/ampere_multi_matmul.cpp | 2 ++
 csrc/scheduler/hopper_multi_matmul.h   | 8 --------
 csrc/scheduler/multi_matmul.h          | 8 ++++++++
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/csrc/scheduler/ampere_multi_matmul.cpp b/csrc/scheduler/ampere_multi_matmul.cpp
index ee21e41ce8b..9bf9cd61e3e 100644
--- a/csrc/scheduler/ampere_multi_matmul.cpp
+++ b/csrc/scheduler/ampere_multi_matmul.cpp
@@ -459,6 +459,8 @@ void AmpereMultipleMatmulScheduler::run() {
   // This also collects mma_results_
   defineOperandCaches();
 
+  setCGADims();
+
   // Schedules:
   //   - global->smem (cp.async)
   //   - smem->register (ldmatrix)
diff --git a/csrc/scheduler/hopper_multi_matmul.h b/csrc/scheduler/hopper_multi_matmul.h
index 1d77785cc99..bf7bc1df0f5 100644
--- a/csrc/scheduler/hopper_multi_matmul.h
+++ b/csrc/scheduler/hopper_multi_matmul.h
@@ -149,14 +149,6 @@ class HopperMultipleMatmulScheduler : public MultipleMatmulScheduler {
   std::vector<std::vector<MatmulDimRole>> blockTileTensors(
       const std::vector<TensorView*>& tvs);
 
-  //! Specifies the CGA dimensions by setting "cluster_dims" as fusion-managed
-  //! data
-  void setCGADims() const {
-    if (params_->cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
-      fusion_->manage("cluster_dims", params_->cluster_dims);
-    }
-  }
-
   //! Schedule the loads of all operands from global memory to shared memory.
   //! Starting from the basic tiled schedule, we swizzle the operand memory.
   //! Note that the cache op and LoadStoreOpType are already set during
diff --git a/csrc/scheduler/multi_matmul.h b/csrc/scheduler/multi_matmul.h
index a08e9bb3f86..8da5ffb6ef7 100644
--- a/csrc/scheduler/multi_matmul.h
+++ b/csrc/scheduler/multi_matmul.h
@@ -46,6 +46,14 @@ class MultipleMatmulScheduler {
   //! that creates a new TensorView, such as caching or rFactor
   void updateIdModel();
 
+  //! Specifies the CGA dimensions by setting "cluster_dims" as fusion-managed
+  //! data
+  void setCGADims() const {
+    if (params_->cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
+      fusion_->manage("cluster_dims", params_->cluster_dims);
+    }
+  }
+
  protected:
   Fusion* fusion_;
   const MatmulParams* params_;

From 193979a16bcf149c248fe87f73ea78ae6e26ff13 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Wed, 11 Dec 2024 12:34:16 -0500
Subject: [PATCH 3/4] Revert "Move setCGADims to MultiMatmulScheduler, enable
 on Ampere"

This reverts commit e3f611c86d0fdfda940d8b098e6cb6e979a0b1ea.
---
 csrc/scheduler/ampere_multi_matmul.cpp | 2 --
 csrc/scheduler/hopper_multi_matmul.h   | 8 ++++++++
 csrc/scheduler/multi_matmul.h          | 8 --------
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/csrc/scheduler/ampere_multi_matmul.cpp b/csrc/scheduler/ampere_multi_matmul.cpp
index 9bf9cd61e3e..ee21e41ce8b 100644
--- a/csrc/scheduler/ampere_multi_matmul.cpp
+++ b/csrc/scheduler/ampere_multi_matmul.cpp
@@ -459,8 +459,6 @@ void AmpereMultipleMatmulScheduler::run() {
   // This also collects mma_results_
   defineOperandCaches();
 
-  setCGADims();
-
   // Schedules:
   //   - global->smem (cp.async)
   //   - smem->register (ldmatrix)
diff --git a/csrc/scheduler/hopper_multi_matmul.h b/csrc/scheduler/hopper_multi_matmul.h
index bf7bc1df0f5..1d77785cc99 100644
--- a/csrc/scheduler/hopper_multi_matmul.h
+++ b/csrc/scheduler/hopper_multi_matmul.h
@@ -149,6 +149,14 @@ class HopperMultipleMatmulScheduler : public MultipleMatmulScheduler {
   std::vector<std::vector<MatmulDimRole>> blockTileTensors(
       const std::vector<TensorView*>& tvs);
 
+  //! Specifies the CGA dimensions by setting "cluster_dims" as fusion-managed
+  //! data
+  void setCGADims() const {
+    if (params_->cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
+      fusion_->manage("cluster_dims", params_->cluster_dims);
+    }
+  }
+
   //! Schedule the loads of all operands from global memory to shared memory.
   //! Starting from the basic tiled schedule, we swizzle the operand memory.
   //! Note that the cache op and LoadStoreOpType are already set during
diff --git a/csrc/scheduler/multi_matmul.h b/csrc/scheduler/multi_matmul.h
index 8da5ffb6ef7..a08e9bb3f86 100644
--- a/csrc/scheduler/multi_matmul.h
+++ b/csrc/scheduler/multi_matmul.h
@@ -46,14 +46,6 @@ class MultipleMatmulScheduler {
   //! that creates a new TensorView, such as caching or rFactor
   void updateIdModel();
 
-  //! Specifies the CGA dimensions by setting "cluster_dims" as fusion-managed
-  //! data
-  void setCGADims() const {
-    if (params_->cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
-      fusion_->manage("cluster_dims", params_->cluster_dims);
-    }
-  }
-
  protected:
   Fusion* fusion_;
   const MatmulParams* params_;

From 6876bcee9ed038e9336d485f3517a673760fcef5 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Wed, 11 Dec 2024 12:57:00 -0500
Subject: [PATCH 4/4] Switch from int->int64_t

---
 csrc/codegen.cpp                  | 3 ++-
 csrc/scheduler/matmul_heuristic.h | 2 +-
 tests/cpp/test_matmul.cpp         | 5 +++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp
index 066670cc15d..0060e626fe6 100644
--- a/csrc/codegen.cpp
+++ b/csrc/codegen.cpp
@@ -276,7 +276,8 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
     code_ << "__global__ void ";
     if (kernel_->hasManaged("cluster_dims")) {
       auto cluster_dims =
-          kernel_->getManaged<std::tuple<int, int, int>>("cluster_dims");
+          kernel_->getManaged<std::tuple<int64_t, int64_t, int64_t>>(
+              "cluster_dims");
       code_ << "__cluster_dims__(" << std::get<0>(cluster_dims) << ", "
             << std::get<1>(cluster_dims) << ", " << std::get<2>(cluster_dims)
             << ") ";
diff --git a/csrc/scheduler/matmul_heuristic.h b/csrc/scheduler/matmul_heuristic.h
index 359f6332b51..f66cd12e618 100644
--- a/csrc/scheduler/matmul_heuristic.h
+++ b/csrc/scheduler/matmul_heuristic.h
@@ -181,7 +181,7 @@
 
   //! This is the CGA size on Hopper+ devices. This parameter is ignored on
   //! Ampere and Turing.
-  std::tuple<int, int, int> cluster_dims = {2, 1, 1};
+  std::tuple<int64_t, int64_t, int64_t> cluster_dims = {2, 1, 1};
 
   std::string toString() const override {
     std::stringstream ss;
diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp
index 1a95cdfd7f8..88cc953c95d 100644
--- a/tests/cpp/test_matmul.cpp
+++ b/tests/cpp/test_matmul.cpp
@@ -3663,7 +3663,7 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) {
   const int64_t cta_m = 2 * getM(macro);
   const int64_t cta_n = 1 * getN(macro);
 
-  constexpr std::tuple<int, int, int> cluster_dims{2, 1, 1};
+  constexpr std::tuple<int64_t, int64_t, int64_t> cluster_dims{2, 1, 1};
 
   auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype);
   auto tv1 = makeContigConcreteTensor({-1, 1, -1}, dtype);
@@ -3680,7 +3680,8 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) {
   auto tv3 = castOp(DataType::Half, tv2);
   fusion.addOutput(tv3);
 
-  if constexpr (cluster_dims != std::tuple<int, int, int>{1, 1, 1}) {
+  if constexpr (
+      cluster_dims != std::tuple<int64_t, int64_t, int64_t>{1, 1, 1}) {
     fusion.manage("cluster_dims", cluster_dims);
   }
 