fix(cmake): build the dynamic library for specified GPU architectures and support multi-threaded compilation (#164)

Added a CMake variable `USER_CUDA_ARCH_LIST` to allow users to specify
CUDA architectures manually.

If this variable is not set, CMake will automatically detect the CUDA
architecture of the underlying machine and build the dynamic library
accordingly.
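A minimal sketch of passing this variable straight to CMake for an out-of-tree configure (the `build/` directory and the bare `cmake ..` invocation are assumptions; `setup.py` normally forwards the flag for you):

```bash
# Hypothetical manual configure step; setup.py passes -DUSER_CUDA_ARCH_LIST itself.
mkdir -p build && cd build
cmake -DUSER_CUDA_ARCH_LIST="7.5 8.0" ..
```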

When building through `setup.py`, set the `TORCH_CUDA_ARCH_LIST` environment
variable to the desired architectures; `setup.py` reads it from the environment
and forwards it to CMake as `USER_CUDA_ARCH_LIST`.
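If you are unsure which value to use, the compute capability of the local GPU can usually be queried with `nvidia-smi` (the `compute_cap` query field is only available on reasonably recent drivers):

```bash
# Prints the compute capability of each local GPU, e.g. "8.0" for an A100.
nvidia-smi --query-gpu=compute_cap --format=csv,noheader
```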

For example:

To build the dynamic library for sm_75 and sm_80:

```bash
export TORCH_CUDA_ARCH_LIST="7.5 8.0"
python3 setup.py develop 2>&1 | tee build.log
```

To build the dynamic library for the local GPU:
```bash
python3 setup.py develop 2>&1 | tee build.log
```
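
The compile itself can also be parallelized: `setup.py` reads `NVCC_THREADS` (forwarded to nvcc's `--threads` flag on sufficiently new CUDA toolkits) and the standard `CMAKE_BUILD_PARALLEL_LEVEL` variable. A sketch with illustrative thread counts:

```bash
export NVCC_THREADS=4                # threads nvcc uses per compilation unit (--threads)
export CMAKE_BUILD_PARALLEL_LEVEL=8  # number of parallel CMake build jobs
python3 setup.py develop 2>&1 | tee build.log
```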
lcy-seso authored Jan 16, 2025
1 parent f689b82 commit 5eda679
Showing 2 changed files with 52 additions and 9 deletions.
34 changes: 27 additions & 7 deletions cmake/generic.cmake
@@ -18,6 +18,10 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Set host compiler flags. Enable all warnings and treat them as errors
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")

set(USER_CUDA_ARCH_LIST
    ""
    CACHE STRING "User-specified CUDA Architectures")

find_package(CUDAToolkit QUIET REQUIRED)
enable_language(CUDA)
set(CMAKE_CUDA on)
@@ -39,10 +43,20 @@ find_package(Torch REQUIRED)
message(STATUS "Torch include include_directories: " ${TORCH_INCLUDE_DIRS})
include_directories(${TORCH_INCLUDE_DIRS})

# let cmake automatically detect the current CUDA architecture to avoid
# generating device codes for all possible architectures
set(CMAKE_CUDA_ARCHITECTURES OFF)
if(USER_CUDA_ARCH_LIST)
  message(STATUS "User specified CUDA architectures: ${USER_CUDA_ARCH_LIST}")
  cuda_select_nvcc_arch_flags(ARCH_LIST ${USER_CUDA_ARCH_LIST})
  # Always append the user-specified CUDA architectures to NVCC flags
  list(APPEND CUDA_NVCC_FLAGS ${ARCH_LIST})
else()
  # let cmake automatically detect the current CUDA architecture to avoid
  # generating device codes for all possible architectures
  message(STATUS "No user specified CUDA architectures, cmake will detect the "
                 "local CUDA architecture.")
endif()

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror all-warnings")

# Set the CUDA_PROPAGATE_HOST_FLAGS to OFF to avoid passing host compiler flags
# to the device compiler
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
@@ -64,10 +78,16 @@ set(CUDA_NVCC_FLAGS
    -U__CUDA_NO_BFLOAT162_CONVERSIONS__)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)

if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
  add_definitions("-DENABLE_BF16")
  message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} "
          "is greater or equal than 11.0, enable -DENABLE_BF16 flag.")
if(DEFINED NVCC_THREADS AND (NOT CUDA_VERSION VERSION_LESS 11.3))
  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--threads ${NVCC_THREADS}")
endif()

message(STATUS "NVCC FLAGS = ${CUDA_NVCC_FLAGS}")

if(${CUDA_VERSION_MAJOR} VERSION_LESS "11")
  message(
    FATAL_ERROR "CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} "
                "should be greater or equal than 11.0 to enable bf16 support.")
endif()

message(STATUS "CUDA detected: " ${CUDA_VERSION})
27 changes: 25 additions & 2 deletions setup.py
@@ -2,14 +2,15 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

import os
import subprocess
from pathlib import Path

from packaging.version import Version, parse
from setuptools import Command, Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
from setuptools.command.develop import develop
from torch.utils.cpp_extension import CUDA_HOME

cur_path = Path(__file__).parent

@@ -30,6 +31,23 @@ def get_requirements():
    return requirements


def get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                         universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    bare_metal_version = parse(output[release_idx].split(",")[0])

    return raw_output, bare_metal_version


def nvcc_threads():
    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
    if bare_metal_version >= Version("11.2"):
        nvcc_threads = os.getenv("NVCC_THREADS") or (os.cpu_count() // 2)
        return nvcc_threads


class CMakeExtension(Extension):
    """ specify the root folder of the CMake projects"""

@@ -56,6 +74,10 @@ def build_extension(self, ext: CMakeExtension) -> None:
                    ) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        # Set CUDA_ARCH_LIST to build the shared library
        # for the specified GPU architectures.
        arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)

        parallel_level = os.environ.get("CMAKE_BUILD_PARALLEL_LEVEL", None)
        if parallel_level is not None:
            self.parallel = int(parallel_level)
@@ -74,7 +96,8 @@ def build_extension(self, ext: CMakeExtension) -> None:
                cfg.upper(), extdir
            ), "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format(
                cfg.upper(), self.build_temp
            )
            ), "-DUSER_CUDA_ARCH_LIST={}".format(arch_list) if arch_list
            else "", "-DNVCC_THREADS={}".format(nvcc_threads())
        ]

        # Adding CMake arguments set as environment variable
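
After the build, one way to confirm which SM architectures actually ended up in the shared library is `cuobjdump` (a sketch; the `build` directory and the `find` expression are assumptions that depend on where your build places the extension):

```bash
# Lists the embedded cubins, e.g. one entry for sm_75 and one for sm_80.
cuobjdump --list-elf "$(find build -name '*.so' | head -n 1)"
```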
