
Commit

Add the heuristics for fp8
Faraz9877 authored and robertgshaw2-neuralmagic committed Nov 19, 2024
1 parent abfd85d commit 540d0ce
Showing 3 changed files with 19 additions and 21 deletions.
4 changes: 2 additions & 2 deletions benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py
@@ -101,8 +101,8 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.float16)
-    out_ref = ops.cutlass_scaled_mm(a, bT, scale_a, scale_b, torch.float16)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
+    out_ref = ops.cutlass_scaled_mm(a, bT, scale_a, scale_b, torch.bfloat16)
 
     if not torch.allclose(out.t(), out_ref):
         print("Incorrect result")
22 changes: 10 additions & 12 deletions csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
@@ -57,24 +57,22 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
   uint32_t const mp2 =
       std::max(static_cast<uint32_t>(64), next_pow_2(n));  // next power of 2
 
-  // if (mp2 <= 64) {
-  //   // n in [1, 64]
-  //   return cutlass_sparse_gemm_caller<Cutlass3xGemmM64>(
-  //       out, a, e, b, std::forward<EpilogueArgs>(args)...);
-  // } else if (mp2 <= 128) {
-  if (mp2 <= 128) {
+  if (mp2 <= 64) {
+    // n in [1, 64]
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmM64>(
+        out, a, e, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
     // n in (64, 128]
     return cutlass_sparse_gemm_caller<Cutlass3xGemmM128>(
         out, a, e, b, std::forward<EpilogueArgs>(args)...);
-  // } else if (mp2 <= 256) {
-  } else {
+  } else if (mp2 <= 256) {
     // n in (128, 256]
     return cutlass_sparse_gemm_caller<Cutlass3xGemmM256>(
         out, a, e, b, std::forward<EpilogueArgs>(args)...);
-  // } else {
-  //   // n in (256, inf)
-  //   return cutlass_sparse_gemm_caller<Cutlass3xGemmM512>(
-  //       out, a, e, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // n in (256, inf)
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmM512>(
+        out, a, e, b, std::forward<EpilogueArgs>(args)...);
   }
 }

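The dispatch above keys the config choice on the next power of two of n, clamped to at least 64. Below is a minimal, self-contained C++ sketch of just that bucketing logic; next_pow_2 and pick_fp8_config here are local stand-ins written for illustration, not the helpers defined in the repository.

// Sketch of the bucketing enabled above: pick an fp8 GEMM config bucket from
// the next power of two of n, clamped to at least 64. Illustrative names only.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Smallest power of two >= v, for v > 0 (bit-twiddling variant).
static uint32_t next_pow_2(uint32_t v) {
  v--;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  return v + 1;
}

enum class Fp8Config { M64, M128, M256, M512 };

// Mirrors the if/else chain in cutlass_gemm_sm90_fp8_dispatch.
static Fp8Config pick_fp8_config(uint32_t n) {
  uint32_t const mp2 = std::max<uint32_t>(64, next_pow_2(n));
  if (mp2 <= 64) return Fp8Config::M64;    // n in [1, 64]
  if (mp2 <= 128) return Fp8Config::M128;  // n in (64, 128]
  if (mp2 <= 256) return Fp8Config::M256;  // n in (128, 256]
  return Fp8Config::M512;                  // n in (256, inf)
}

int main() {
  const uint32_t sizes[] = {1, 64, 65, 200, 4096};
  for (uint32_t n : sizes)
    std::printf("n=%u -> config bucket %d\n", n,
                static_cast<int>(pick_fp8_config(n)));
  return 0;
}

For example, n = 200 rounds up to mp2 = 256 and lands in the M256 bucket, while n = 4096 falls through to the M512 bucket.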
14 changes: 7 additions & 7 deletions csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
@@ -533,9 +533,9 @@ struct sm90_fp8_config_M64 {
   // M in [1, 64]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+      cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecializedCooperative;
-  using TileShape = Shape<_128, _64, _256>;
+  using TileShape = Shape<_64, _64, _256>;
   using ClusterShape = Shape<_1, _1, _1>;
 
   using TileSchedule = cutlass::gemm::PersistentScheduler;
@@ -552,9 +552,9 @@ struct sm90_fp8_config_M128 {
   // M in (64, 128]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecializedCooperative;
-  using TileShape = Shape<_128, _64, _256>;
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _256>;
   using ClusterShape = Shape<_1, _1, _1>;
 
   using TileSchedule = cutlass::gemm::PersistentScheduler;
@@ -590,9 +590,9 @@ struct sm90_fp8_config_M512 {
   // M in (256, ]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecializedCooperative;
-  using TileShape = Shape<_256, _128, _128>;
+  using TileShape = Shape<_128, _128, _256>;
   using ClusterShape = Shape<_1, _1, _1>;
 
   using TileSchedule = cutlass::gemm::PersistentScheduler;
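For quick comparison, here is a standalone sketch that tabulates the kernel schedule, epilogue schedule, and tile shape each touched fp8 config carries after this commit; the Fp8TileChoice struct and kChoices table below are illustrative only, not repository code, and the M256 config is not part of the hunks shown here.

// Per-config choices after this commit, laid out as plain data for comparison.
#include <cstdio>

struct Fp8TileChoice {
  const char* config;           // config struct the row describes
  const char* kernel_schedule;  // CUTLASS mainloop schedule
  const char* epilogue;         // CUTLASS epilogue schedule
  int tile_m, tile_n, tile_k;   // CTA tile shape
};

static const Fp8TileChoice kChoices[] = {
    {"sm90_fp8_config_M64", "KernelTmaWarpSpecializedFP8FastAccum",
     "TmaWarpSpecializedCooperative", 64, 64, 256},
    {"sm90_fp8_config_M128", "KernelTmaWarpSpecializedPingpongFP8FastAccum",
     "TmaWarpSpecialized", 64, 128, 256},
    {"sm90_fp8_config_M512", "KernelTmaWarpSpecializedCooperativeFP8FastAccum",
     "TmaWarpSpecializedCooperative", 128, 128, 256},
};

int main() {
  for (const Fp8TileChoice& c : kChoices)
    std::printf("%-22s %-46s %-32s %dx%dx%d\n", c.config, c.kernel_schedule,
                c.epilogue, c.tile_m, c.tile_n, c.tile_k);
  return 0;
}

As a general CUTLASS convention, pingpong mainloops are usually paired with the plain TmaWarpSpecialized epilogue while cooperative mainloops pair with TmaWarpSpecializedCooperative, which matches the M128 and M512 rows above.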
