From 2d03e1d57f5debcb4c73de52804ea622a38e35fa Mon Sep 17 00:00:00 2001
From: Faraz Shahsavan
Date: Thu, 14 Nov 2024 23:05:40 +0000
Subject: [PATCH] Fix cmake errors

---
 CMakeLists.txt                                | 78 +++----------------
 .../cutlass_benchmarks/sparse_mm/bench_v1.py  | 45 +++++------
 nm_cutlass_c.cmake                            | 44 -----------
 setup.py                                      | 18 ++---
 4 files changed, 40 insertions(+), 145 deletions(-)
 delete mode 100644 nm_cutlass_c.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa149a4cfcf2d..67ed6e3d54d21 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,13 +208,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG be692b48b01620eedabeef8325df5d4eeed6c2ae
+        GIT_TAG 1dbae0329c6d907b72b373667b4d5716bae4415f
         GIT_PROGRESS TRUE

         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
         # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
         # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
-        GIT_SHALLOW TRUE
+        GIT_SHALLOW FALSE
   )
   FetchContent_MakeAvailable(cutlass)

@@ -258,11 +258,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()

   #
-  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+  # The cutlass_scaled_mm, cutlass_scaled_sparse_mm, and cutlass_compressor
+  # kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
   cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+             "csrc/sparse/cutlass/sparse_compressor.cu"
+             "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -271,12 +274,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
   else()
     if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+      message(STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
-                     "later if you intend on running FP8 quantized models on "
+                     "later if you intend to run FP8 quantized or sparse models on "
                      "Hopper.")
     else()
-      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+      message(STATUS "Not building cutlass_c3x kernels as no compatible archs found "
                      "in CUDA target architectures")
     endif()

@@ -285,63 +288,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set(SCALED_MM_3X_ARCHS)
   endif()

-  #
-  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/sparse/cutlass/sparse_compressor.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
-    message(STATUS "Building test_util for archs: ${SCALED_MM_3X_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building test_util as CUDA Compiler version is "
-                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
-                     "later if you intend on running FP8 quantized models on "
-                     "Hopper.")
-    else()
-      message(STATUS "Not building test_util as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
-    # build any 3x kernels
-    set(SCALED_MM_3X_ARCHS)
-  endif()
-
-  #
-  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
-    message(STATUS "Building test_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building test_mm_c3x as CUDA Compiler version is "
-                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
-                     "later if you intend on running FP8 quantized models on "
-                     "Hopper.")
-    else()
-      message(STATUS "Not building test_mm_c3x as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
-    # build any 3x kernels
-    set(SCALED_MM_3X_ARCHS)
-  endif()
-
-  #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
@@ -458,8 +404,8 @@ define_gpu_extension_target(
 # Setting this variable sidesteps the issue by calling the driver directly.
 target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

-include(nm_cutlass_c.cmake)
-build_nm_cutlass_c()
+# include(nm_cutlass_c.cmake)
+# build_nm_cutlass_c()

 #
 # _moe_C extension
diff --git a/benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py b/benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py
index 9c516fc6762a7..22616c2359b74 100644
--- a/benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py
+++ b/benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py
@@ -82,8 +82,13 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
 def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
     assert dtype == torch.float8_e4m3fn
-    a_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
+    # Create tensors: b_compressed/e are the compressed sparse operand and its metadata
+    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
+    aT = a.t()
+    bT = b.t()
+    bf16_a = a.to(dtype=torch.bfloat16)
+    bf16_bT = bT.to(dtype=torch.bfloat16)
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

@@ -94,7 +99,7 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     timers.append(
         bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
                  torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda")))
+                 bT.to(dtype=torch.bfloat16, device="cuda")))

     # pytorch impl: bf16 output, without fp8 fast accum
     timers.append(
@@ -103,7 +108,7 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
                  "pytorch_fp8_fp8_bf16_scaled_mm",
                  torch._scaled_mm,
                  a,
-                 b,
+                 bT,
                  scale_a=scale_a,
                  scale_b=scale_b,
                  out_dtype=torch.bfloat16))
@@ -115,7 +120,7 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
                  "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
                  torch._scaled_mm,
                  a,
-                 b,
+                 bT,
                  scale_a=scale_a,
                  scale_b=scale_b,
                  out_dtype=torch.bfloat16,
@@ -128,7 +133,7 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
                  "pytorch_fp8_fp8_fp16_scaled_mm",
                  torch._scaled_mm,
                  a,
-                 b,
+                 bT,
                  scale_a=scale_a,
                  scale_b=scale_b,
                  out_dtype=torch.float16))
@@ -140,7 +145,7 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
                  "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
                  torch._scaled_mm,
                  a,
-                 b,
+                 bT,
                  scale_a=scale_a,
                  scale_b=scale_b,
                  out_dtype=torch.float16,
@@ -149,24 +154,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b,
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
                  torch.bfloat16))

     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16))
-
-    # cutlass impl: bf16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass impl: fp16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16,
-                 bias.to(dtype=torch.float16)))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16))

     return timers
@@ -307,12 +300,12 @@ def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,

 def bench_v1(dtype: torch.dtype, m: int, k: int, n: int, label: str,
              sub_label: str) -> Iterable[TMeasurement]:
-    if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
+    # if dtype == torch.int8:
+    #     return bench_int8(dtype, m, k, n, label, sub_label)
     if dtype == torch.float8_e4m3fn:
         return bench_fp8(dtype, m, k, n, label, sub_label)
-    if dtype == torch.float16:
-        return bench_fp16(dtype, m, k, n, label, sub_label)
-    if dtype == torch.bfloat16:
-        return bench_bf16(dtype, m, k, n, label, sub_label)
+    # if dtype == torch.float16:
+    #     return bench_fp16(dtype, m, k, n, label, sub_label)
+    # if dtype == torch.bfloat16:
+    #     return bench_bf16(dtype, m, k, n, label, sub_label)
     raise ValueError("unsupported type")
diff --git a/nm_cutlass_c.cmake b/nm_cutlass_c.cmake
deleted file mode 100644
index 8228c890244af..0000000000000
--- a/nm_cutlass_c.cmake
+++ /dev/null
@@ -1,44 +0,0 @@
-function(build_nm_cutlass_c)
-
-  message (STATUS "Project root dir ${PROJECT_ROOT_DIR}")
-  file(GLOB full_path_generated_dirs LIST_DIRECTORIES true "${PROJECT_ROOT_DIR}/csrc/sparse/cutlass/generator/generated/*")
-
-  message (STATUS "fullpath generated dirs ${full_path_generated_dirs}")
-
-  set(generated_dirs)
-  foreach(d ${full_path_generated_dirs})
-    get_filename_component(d_name ${d} NAME)
-    list(APPEND generated_dirs ${d_name})
-  endforeach()
-
-  set(NM_CUTLASS_C_ARCHS "9.0;9.0a")
-
-  foreach(d ${generated_dirs})
-
-    set(SRCS_DIR "csrc/sparse/cutlass/generator/generated/${d}")
-    set(SRCS)
-    file(GLOB SRCS "${SRCS_DIR}/*cu")
-    list(APPEND SRCS "${SRCS_DIR}/torch_bindings.cpp")
-
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${NM_CUTLASS_C_ARCHS}")
-
-    set(EXT_NAME "_nm_cutlass_${d}_C")
-    message(STATUS "Enabling ${EXT_NAME} extension.")
-    define_gpu_extension_target(
-      ${EXT_NAME}
-      DESTINATION vllm
-      LANGUAGE ${VLLM_GPU_LANG}
-      SOURCES ${SRCS}
-      COMPILE_FLAGS ${VLLM_GPU_FLAGS}
-      ARCHITECTURES ${VLLM_GPU_ARCHES}
-      INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
-      USE_SABI 3
-      WITH_SOABI)
-
-    target_compile_definitions(${EXT_NAME} PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
-
-  endforeach()
-
-endfunction()
diff --git a/setup.py b/setup.py
index 43c3b3a268fa2..14147ca21a39f 100644
--- a/setup.py
+++ b/setup.py
@@ -462,15 +462,15 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules.append(
         CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))

-if _is_cuda():
-    sparse_mm_generated_dir = './csrc/sparse/cutlass/generator/generated/'
-    sparse_mm_generated_dirs = \
-        [x for x in Path(sparse_mm_generated_dir).iterdir() if x.is_dir()]
-    sparse_mm_generated_dir_names = [x.name for x in sparse_mm_generated_dirs]
-    nm_cutlass_extensions = \
-        [f"vllm._nm_cutlass_{x}_C" for x in sparse_mm_generated_dir_names]
-    for x in nm_cutlass_extensions:
-        ext_modules.append(CMakeExtension(name=x))
+# if _is_cuda():
+#     sparse_mm_generated_dir = './csrc/sparse/cutlass/generator/generated/'
+#     sparse_mm_generated_dirs = \
+#         [x for x in Path(sparse_mm_generated_dir).iterdir() if x.is_dir()]
+#     sparse_mm_generated_dir_names = [x.name for x in sparse_mm_generated_dirs]
+#     nm_cutlass_extensions = \
+#         [f"vllm._nm_cutlass_{x}_C" for x in sparse_mm_generated_dir_names]
+#     for x in nm_cutlass_extensions:
+#         ext_modules.append(CMakeExtension(name=x))

 if _build_custom_ops():
     ext_modules.append(CMakeExtension(name="vllm._C"))
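--
Usage sketch (not part of the patch): the benchmark now passes the compressed
sparse operand and its metadata first, the dense operand transposed, and the
scales swapped relative to the old `a_compressed, e, b, scale_a, scale_b`
call. The snippet below is a minimal illustration of that convention; the
`make_rand_sparse_tensors` helper and the `from vllm import _custom_ops as ops`
import path are assumed from the benchmark's context, and the shapes are
arbitrary.

    import torch

    from vllm import _custom_ops as ops            # assumed import path for `ops`
    from utils import make_rand_sparse_tensors     # assumed benchmark-local helper

    m, k, n = 16, 4096, 4096                       # illustrative shapes only

    # The helper returns the compressed sparse operand, its metadata, and the
    # dense reference tensors, as used in bench_fp8 above.
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)

    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    # New argument order: compressed operand and metadata first, then the
    # transposed dense operand, with the scales swapped to match
    # (scale_b before scale_a), then the output dtype.
    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, a.t(), scale_b, scale_a,
                                       torch.bfloat16)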