fix(cmake): build the dynamic library for specified GPU architectures and support multi-threaded compilation (#164)

Added a CMake variable `USER_CUDA_ARCH_LIST` to allow users to specify
CUDA architectures manually.

If this variable is not set, CMake will automatically detect the CUDA
architecture of the underlying machine and build the dynamic library
accordingly.
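A minimal sketch of passing this variable straight to CMake for an out-of-tree configure (the `build/` directory and the bare `cmake ..` invocation are assumptions; `setup.py` normally forwards the flag for you):

```bash
# Hypothetical manual configure step; setup.py passes -DUSER_CUDA_ARCH_LIST itself.
mkdir -p build && cd build
cmake -DUSER_CUDA_ARCH_LIST="7.5 8.0" ..
```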

When building through `setup.py`, set the `TORCH_CUDA_ARCH_LIST` environment
variable to the desired architectures; `setup.py` reads it from the environment
and forwards it to CMake as `USER_CUDA_ARCH_LIST`.
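If you are unsure which value to use, the compute capability of the local GPU can usually be queried with `nvidia-smi` (the `compute_cap` query field is only available on reasonably recent drivers):

```bash
# Prints the compute capability of each local GPU, e.g. "8.0" for an A100.
nvidia-smi --query-gpu=compute_cap --format=csv,noheader
```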

For example:

To build the dynamic library for sm_75 and sm_80:

```bash
export TORCH_CUDA_ARCH_LIST="7.5 8.0"
python3 setup.py develop 2>&1 | tee build.log
```

To build the dynamic library for the local GPU:
```bash
python3 setup.py develop 2>&1 | tee build.log
```
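
The compile itself can also be parallelized: `setup.py` reads `NVCC_THREADS` (forwarded to nvcc's `--threads` flag on sufficiently new CUDA toolkits) and the standard `CMAKE_BUILD_PARALLEL_LEVEL` variable. A sketch with illustrative thread counts:

```bash
export NVCC_THREADS=4                # threads nvcc uses per compilation unit (--threads)
export CMAKE_BUILD_PARALLEL_LEVEL=8  # number of parallel CMake build jobs
python3 setup.py develop 2>&1 | tee build.log
```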
lcy-seso authored Jan 16, 2025
1 parent f689b82 commit 5eda679
Showing 2 changed files with 52 additions and 9 deletions.
34 changes: 27 additions & 7 deletions cmake/generic.cmake
@@ -18,6 +18,10 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Set host compiler flags. Enable all warnings and treat them as errors
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")

set(USER_CUDA_ARCH_LIST
    ""
    CACHE STRING "User-specified CUDA Architectures")

find_package(CUDAToolkit QUIET REQUIRED)
enable_language(CUDA)
set(CMAKE_CUDA on)
@@ -39,10 +43,20 @@ find_package(Torch REQUIRED)
message(STATUS "Torch include include_directories: " ${TORCH_INCLUDE_DIRS})
include_directories(${TORCH_INCLUDE_DIRS})

# let cmake automatically detect the current CUDA architecture to avoid
# generating device codes for all possible architectures
set(CMAKE_CUDA_ARCHITECTURES OFF)
if(USER_CUDA_ARCH_LIST)
  message(STATUS "User specified CUDA architectures: ${USER_CUDA_ARCH_LIST}")
  cuda_select_nvcc_arch_flags(ARCH_LIST ${USER_CUDA_ARCH_LIST})
  # Always append the user-specified CUDA architectures to NVCC flags
  list(APPEND CUDA_NVCC_FLAGS ${ARCH_LIST})
else()
  # let cmake automatically detect the current CUDA architecture to avoid
  # generating device codes for all possible architectures
  message(STATUS "No user specified CUDA architectures, cmake will detect the "
                 "local CUDA architecture.")
endif()

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror all-warnings")

# Set the CUDA_PROPAGATE_HOST_FLAGS to OFF to avoid passing host compiler flags
# to the device compiler
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
@@ -64,10 +78,16 @@ set(CUDA_NVCC_FLAGS
    -U__CUDA_NO_BFLOAT162_CONVERSIONS__)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)

if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
  add_definitions("-DENABLE_BF16")
  message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} "
          "is greater or equal than 11.0, enable -DENABLE_BF16 flag.")
if(DEFINED NVCC_THREADS AND (NOT CUDA_VERSION VERSION_LESS 11.3))
  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--threads ${NVCC_THREADS}")
endif()

message(STATUS "NVCC FLAGS = ${CUDA_NVCC_FLAGS}")

if(${CUDA_VERSION_MAJOR} VERSION_LESS "11")
  message(
    FATAL_ERROR "CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} "
                "should be greater or equal than 11.0 to enable bf16 support.")
endif()

message(STATUS "CUDA detected: " ${CUDA_VERSION})
27 changes: 25 additions & 2 deletions setup.py
@@ -2,14 +2,15 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

import os
import subprocess
from pathlib import Path

from packaging.version import Version, parse
from setuptools import Command, Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
from setuptools.command.develop import develop
from torch.utils.cpp_extension import CUDA_HOME

cur_path = Path(__file__).parent

@@ -30,6 +31,23 @@ def get_requirements():
    return requirements


def get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                         universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    bare_metal_version = parse(output[release_idx].split(",")[0])

    return raw_output, bare_metal_version


def nvcc_threads():
    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
    if bare_metal_version >= Version("11.2"):
        nvcc_threads = os.getenv("NVCC_THREADS") or (os.cpu_count() // 2)
        return nvcc_threads


class CMakeExtension(Extension):
    """ specify the root folder of the CMake projects"""

@@ -56,6 +74,10 @@ def build_extension(self, ext: CMakeExtension) -> None:
                    ) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        # Set CUDA_ARCH_LIST to build the shared library
        # for the specified GPU architectures.
        arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)

        parallel_level = os.environ.get("CMAKE_BUILD_PARALLEL_LEVEL", None)
        if parallel_level is not None:
            self.parallel = int(parallel_level)
@@ -74,7 +96,8 @@ def build_extension(self, ext: CMakeExtension) -> None:
                cfg.upper(), extdir
            ), "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format(
                cfg.upper(), self.build_temp
            )
            ), "-DUSER_CUDA_ARCH_LIST={}".format(arch_list) if arch_list
            else "", "-DNVCC_THREADS={}".format(nvcc_threads())
        ]

        # Adding CMake arguments set as environment variable
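
After the build, one way to confirm which SM architectures actually ended up in the shared library is `cuobjdump` (a sketch; the `build` directory and the `find` expression are assumptions that depend on where your build places the extension):

```bash
# Lists the embedded cubins, e.g. one entry for sm_75 and one for sm_80.
cuobjdump --list-elf "$(find build -name '*.so' | head -n 1)"
```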
