diff --git a/csrc/ops.h b/csrc/ops.h
index 171b70eb80aee..363ddec3d0729 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -155,8 +155,6 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            c10::optional<torch::Tensor> const& azp,
                            c10::optional<torch::Tensor> const& bias);
 
-bool cutlass_scaled_sparse_mm_supports_fp8(int64_t cuda_device_capability);
-
 void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
                               torch::Tensor const& e, torch::Tensor const& b,
                               torch::Tensor const& a_scales,
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
index 9e23df5a05d69..5075c342098ba 100644
--- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -12,22 +12,6 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                                    c10::optional<torch::Tensor> const& bias);
 #endif
 
-bool cutlass_scaled_sparse_mm_supports_fp8(int64_t cuda_device_capability) {
-  // CUTLASS FP8 kernels need at least
-  //   CUDA 12.0 on SM90 systems (Hopper)
-  //   CUDA 12.4 on SM89 systems (Lovelace)
-
-#if defined CUDA_VERSION
-  if (cuda_device_capability >= 90) {
-    return CUDA_VERSION >= 12000;
-  } else if (cuda_device_capability >= 89) {
-    return CUDA_VERSION >= 12040;
-  }
-#endif
-
-  return false;
-}
-
 int32_t test_get_sm_version_num() {
   int32_t major_capability, minor_capability;
   cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 667aa94db3218..3cbc843dc501a 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -313,7 +313,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
   ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
 
-  // Test
+  // CUTLASS sparse GEMM, supporting symmetric per-tensor or per-row/column
+  // quantization, as well as bias
   ops.def(
       "cutlass_scaled_sparse_mm(Tensor! out, Tensor a,"
       " Tensor e,"
@@ -321,14 +322,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor b_scales, Tensor? bias) -> ()");
   ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm);
 
-  // Test
-  ops.def(
-      "cutlass_scaled_sparse_mm_supports_fp8(int cuda_device_capability) -> "
-      "bool");
-  ops.impl("cutlass_scaled_sparse_mm_supports_fp8",
-           &cutlass_scaled_sparse_mm_supports_fp8);
-
-  // Test
+  // CUTLASS sparse matrix compressor
   ops.def(
       "cutlass_compress_entry(Tensor! a_compressed, Tensor! e,"
       " Tensor a) -> bool");
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 6087247de5a94..d99d2340275af 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -532,11 +532,6 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
     return out
 
 
-def cutlass_scaled_sparse_mm_supports_fp8(cuda_device_capability: int) -> bool:
-    return torch.ops._C.cutlass_scaled_sparse_mm_supports_fp8(
-        cuda_device_capability)
-
-
 def cutlass_compress_entry(a: torch.Tensor) \
         -> Tuple[torch.Tensor, torch.Tensor]:
     assert (a.dtype is torch.int8 or a.dtype is torch.float8_e4m3fn or \