[Bugfix][Build/CI] Fix sparse CUTLASS compilation on CUDA [12.0, 12.2)
Signed-off-by: Tyler Michael Smith <[email protected]>
tlrmchlsmth committed Dec 18, 2024
1 parent ca5f54a commit 4b8652d
Showing 10 changed files with 176 additions and 15 deletions.
39 changes: 30 additions & 9 deletions CMakeLists.txt
@@ -273,15 +273,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
" in CUDA target architectures")
endif()

#
- # The cutlass_scaled_mm cutlass_scaled_sparse_mm, and cutlass_compressor kernels
- # For Hopper (c3x, i.e. CUTLASS 3.x) require
+ # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/sparse/cutlass/sparse_compressor_c3x.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -290,12 +286,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is "
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 sparse or quantized models on "
"later if you intend on running FP8 quantized models on "
"Hopper.")
else()
message(STATUS "Not building cutlass_c3x as no compatible archs found "
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()

@@ -329,6 +325,31 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()

#
# 2:4 Sparse Kernels

# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper.")
else()
message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()


#
# Machete kernels
2 changes: 2 additions & 0 deletions csrc/ops.h
@@ -163,6 +163,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);

bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability);

void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& e,
torch::Tensor const& a_scales,
96 changes: 96 additions & 0 deletions csrc/reduce.cu
@@ -0,0 +1,96 @@
#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Custom reduce implementation
namespace vllm {

namespace detail {

template <typename T>
__inline__ __device__ T _sum(T a, T b) {
return a + b;
}

} // namespace detail

template <typename T>
using ReduceFnType = T (*)(T, T);

template <typename T, int numLanes = 32>
__inline__ __device__ T warpReduce(T val, ReduceFnType<T> fn) {
for (int mask = numLanes / 2; mask > 0; mask /= 2)
val = fn(val, __shfl_xor_sync(0xffffffff, val, mask));
return val;
}

template <typename T, int maxBlockSize = 1024>
__inline__ __device__ T blockReduce(T val, ReduceFnType<T> fn) {
static __shared__ T shared[32]; // Assuming a warp size of 32
int lane = threadIdx.x % 32;
int wid = threadIdx.x / 32;

val = warpReduce<T>(val, fn);

if (lane == 0) shared[wid] = val;

__syncthreads();

// Lane-indexed read of the per-warp partial sums. Assumes blockDim.x is a
// multiple of the warp size (true for the 256-thread launches below);
// otherwise the last, partial warp's result would be dropped.
val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : T(0);
if (wid == 0) val = warpReduce<T>(val, fn);

return val;
}

template <typename T, int maxBlockSize = 1024>
__inline__ __device__ T blockReduceSum(T val) {
return blockReduce<T, maxBlockSize>(val, detail::_sum<T>);
}

} // namespace vllm

// Kernel using custom reduce
template <typename T>
__global__ void custom_reduce_kernel(const T* input, T* output, int n) {
T sum = 0;
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
sum += input[i];
}
sum = vllm::blockReduceSum<T>(sum);
if (threadIdx.x == 0) {
atomicAdd(output, sum);
}
}

// Kernel using CUB reduce
template <typename T>
__global__ void cub_reduce_kernel(const T* input, T* output, int n) {
typedef cub::BlockReduce<T, 256> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;

T sum = 0;
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
sum += input[i];
}

sum = BlockReduce(temp_storage).Sum(sum);

if (threadIdx.x == 0) {
atomicAdd(output, sum);
}
}

// Helper function to launch custom reduce kernel
template <typename T>
void custom_reduce(const T* d_in, T* d_out, int n, cudaStream_t stream) {
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
custom_reduce_kernel<<<grid_size, block_size, 0, stream>>>(d_in, d_out, n);
}

// Helper function to launch CUB reduce kernel
template <typename T>
void cub_reduce(const T* d_in, T* d_out, int n, cudaStream_t stream) {
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
cub_reduce_kernel<<<grid_size, block_size, 0, stream>>>(d_in, d_out, n);
}
4 changes: 3 additions & 1 deletion csrc/sparse/cutlass/sparse_compressor_c3x.cu
@@ -2,6 +2,7 @@
// clang-format off
#include <cudaTypedefs.h>

#if defined CUDA_VERSION && CUDA_VERSION >= 12020
#include "sparse_scaled_mm_c3x.cuh"

#include "cutlass/numeric_conversion.h"
@@ -160,4 +161,5 @@ bool cutlass_sparse_compress_sm90(torch::Tensor& a_nzs, torch::Tensor& a_meta,
return cutlass_sparse_compress<int8_t, int32_t>(a_nzs, a_meta, a);
}
return false;
- }
+ }
+ #endif
4 changes: 2 additions & 2 deletions csrc/sparse/cutlass/sparse_compressor_entry.cu
@@ -5,7 +5,7 @@

#include "cutlass_extensions/common.hpp"

- #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
+ #if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
bool cutlass_sparse_compress_sm90(torch::Tensor& a_nzs, torch::Tensor& a_meta,
torch::Tensor const& a);
#endif
@@ -28,7 +28,7 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_nzs, torch::Tensor& a_meta,
int32_t version_num = get_sm_version_num();

// Guard against compilation issues for sm90 kernels
- #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
+ #if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
if (version_num >= 90) {
return cutlass_sparse_compress_sm90(a_nzs, a_meta, a);
}
2 changes: 1 addition & 1 deletion csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
@@ -2,7 +2,7 @@
// clang-format off
#include <cudaTypedefs.h>

- #if defined CUDA_VERSION && CUDA_VERSION >= 12000
+ #if defined CUDA_VERSION && CUDA_VERSION >= 12020
#include "sparse_scaled_mm_c3x.cuh"
// clang-format on

15 changes: 13 additions & 2 deletions csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -5,7 +5,18 @@

#include "cutlass_extensions/common.hpp"

- #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
+ bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability) {
+   // sparse CUTLASS kernels need at least
+   // CUDA 12.2 and SM90 (Hopper)
+
+ #if defined CUDA_VERSION
+   return CUDA_VERSION >= 12020 && cuda_device_capability >= 90;
+ #endif
+
+   return false;
+ }
+
+ #if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& e,
@@ -43,7 +54,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
int32_t version_num = get_sm_version_num();

// Guard against compilation issues for sm90 kernels
- #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
+ #if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
if (version_num >= 90) {
cutlass_scaled_sparse_mm_sm90(c, a, bt_nzs, bt_meta, a_scales, b_scales,
bias);
7 changes: 7 additions & 0 deletions csrc/torch_bindings.cpp
@@ -321,6 +321,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);

// Check if cutlass sparse scaled_mm is supported for CUDA devices of the
// given capability
ops.def(
"cutlass_sparse_scaled_mm_supported(int cuda_device_capability) -> bool");
ops.impl("cutlass_sparse_scaled_mm_supported",
&cutlass_sparse_scaled_mm_supported);

// CUTLASS sparse GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias
ops.def(
5 changes: 5 additions & 0 deletions vllm/_custom_ops.py
@@ -552,6 +552,11 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
return out


def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
return torch.ops._C.cutlass_sparse_scaled_mm_supported(
cuda_device_capability)


def cutlass_sparse_compress(a: torch.Tensor) \
-> Tuple[torch.Tensor, torch.Tensor]:
"""
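For reference, a minimal usage sketch (not part of this commit) of how the new wrapper can be probed from user code. It assumes a CUDA build of vLLM with these bindings compiled in; has_sparse_cutlass is an illustrative helper name, not an API added by the diff.

import torch

from vllm import _custom_ops as ops


def has_sparse_cutlass() -> bool:
    """Best-effort probe for the 2:4 sparse CUTLASS path."""
    if not torch.cuda.is_available():
        return False
    # Encode the compute capability as major * 10 + minor, so SM 9.0
    # (Hopper) becomes 90, matching the >= 90 check on the C++ side.
    major, minor = torch.cuda.get_device_capability()
    return ops.cutlass_sparse_scaled_mm_supported(major * 10 + minor)

The op returns False for builds compiled against CUDA older than 12.2, which is exactly the configuration this commit stops compiling the sparse kernels for.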
17 changes: 17 additions & 0 deletions vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -14,10 +14,22 @@
ChannelQuantScaleParameter,
ModelWeightParameter,
PerTensorScaleParameter)
from vllm.platforms import current_platform

__all__ = ["CompressedTensors24"]


def sparse_cutlass_supported() -> bool:
# sparse cutlass is not supported on Rocm
if current_platform.is_rocm():
return False

capability_tuple = current_platform.get_device_capability()
capability = -1 if capability_tuple is None else capability_tuple.to_int()

return ops.cutlass_sparse_scaled_mm_supported(capability)


class CompressedTensors24(CompressedTensorsScheme):

def __init__(self,
@@ -40,6 +52,11 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
params_dtype: torch.dtype, weight_loader: Callable,
**kwargs):

if not sparse_cutlass_supported():
raise ValueError(
"Sparse CUTLASS not supported. vLLM must be built with"
"CUDA 12.2 or later to use this feature")

self.output_dtype = params_dtype
layer.logical_widths = output_partition_sizes
self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype)
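To see the effect at the scheme level, a rough check (again a sketch, not part of the commit; the import path assumes the helper lives in the compressed_tensors_24 scheme module shown above):

from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_24 import (
    sparse_cutlass_supported)

if sparse_cutlass_supported():
    print("2:4 sparse CUTLASS kernels available (CUDA >= 12.2 build, SM90 GPU).")
else:
    # On older CUDA builds or non-Hopper GPUs, CompressedTensors24.create_weights()
    # raises ValueError (see the guard added above).
    print("2:4 sparse CUTLASS kernels unavailable on this build/GPU.")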
