diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 7ebaaa051ee56..070e7a14687ee 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -66,6 +66,13 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH% set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% set CUDNN_ROOT_DIR=%CUDA_PATH% +set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt +set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% + +set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 +set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% +set CUDNN_ROOT_DIR=%CUDA_PATH% +set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% :cuda_build_end diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 4a464d6b5786a..4934804d2daea 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -40,6 +40,7 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH% set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% set CUDNN_ROOT_DIR=%CUDA_PATH% +set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice diff --git a/.ci/pytorch/win-test-helpers/test_custom_backend.bat b/.ci/pytorch/win-test-helpers/test_custom_backend.bat index bc15b19ed87be..e209abacca9b9 100644 --- a/.ci/pytorch/win-test-helpers/test_custom_backend.bat +++ b/.ci/pytorch/win-test-helpers/test_custom_backend.bat @@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1 :: Run tests C++-side and load the exported script module. cd build -set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH% +set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH% test_custom_backend.exe model.pt if ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat b/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat index bc5ffd87e0dc3..08979059fb283 100644 --- a/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat +++ b/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat @@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1 :: Run tests C++-side and load the exported script module. cd build -set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH% +set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH% test_custom_ops.exe model.pt if ERRORLEVEL 1 exit /b 1 diff --git a/.gitmodules b/.gitmodules index 32c7a83705ad8..7e1b09e591cd5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -149,6 +149,3 @@ [submodule "third_party/mimalloc"] path = third_party/mimalloc url = https://github.com/microsoft/mimalloc.git -[submodule "third_party/NVTX"] - path = third_party/NVTX - url = https://github.com/NVIDIA/NVTX.git diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index ba75466ff7572..748363725bcc3 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1548,8 +1548,7 @@ if(USE_CUDA) target_link_libraries(torch_cpu PRIVATE torch::cudart) endif() target_link_libraries(torch_cuda INTERFACE torch::cudart) - target_link_libraries(torch_cuda PUBLIC c10_cuda) - target_link_libraries(torch_cuda PRIVATE torch::nvtoolsext) + target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext) target_include_directories( torch_cuda INTERFACE $) @@ -1606,7 +1605,7 @@ if(BUILD_SHARED_LIBS) # not find them, because they're usually in non-standard locations) if(USE_CUDA) target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) - target_link_libraries(torch_global_deps torch::cudart) + target_link_libraries(torch_global_deps torch::cudart torch::nvtoolsext) endif() if(USE_TBB) target_link_libraries(torch_global_deps TBB::tbb) diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 283f08cd1f546..888d286458a3a 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -26,6 +26,13 @@ th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch') th_dll_path = os.path.join(th_root, 'lib') + if not os.path.exists(os.path.join(th_dll_path, 'nvToolsExt64_1.dll')) and \ + not os.path.exists(os.path.join(py_dll_path, 'nvToolsExt64_1.dll')): + nvtoolsext_dll_path = os.path.join( + os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt'), 'bin', 'x64') + else: + nvtoolsext_dll_path = '' + import importlib.util import glob spec = importlib.util.spec_from_file_location('torch_version', os.path.join(th_root, 'version.py')) @@ -43,7 +50,7 @@ import ctypes kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True) - dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, cuda_path])) + dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path])) with_load_library_flags = hasattr(kernel32, 'AddDllDirectory') prev_error_mode = kernel32.SetErrorMode(0x0001) diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 37344dcc62e03..6d518a1489626 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -129,18 +129,30 @@ endif() if(@USE_CUDA@) if(MSVC) - set(TORCH_CUDA_LIBRARIES ${CUDA_LIBRARIES}) + if(NOT NVTOOLEXT_HOME) + set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") + endif() + if(DEFINED ENV{NVTOOLSEXT_PATH}) + set(NVTOOLEXT_HOME $ENV{NVTOOLSEXT_PATH}) + endif() + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + list(APPEND TORCH_INCLUDE_DIRS ${NVTOOLEXT_HOME}/include) find_library(CAFFE2_NVRTC_LIBRARY caffe2_nvrtc PATHS "${TORCH_INSTALL_PREFIX}/lib") list(APPEND TORCH_CUDA_LIBRARIES ${CAFFE2_NVRTC_LIBRARY}) elseif(APPLE) set(TORCH_CUDA_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib ${CUDA_LIBRARIES}) else() + find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) set(TORCH_CUDA_LIBRARIES ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} + ${LIBNVTOOLSEXT} ${CUDA_LIBRARIES}) endif() if(@BUILD_SHARED_LIBS@) diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 43983400cacc8..c7595774d810b 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -66,6 +66,10 @@ if(NOT CMAKE_CUDA_COMPILER_VERSION VERSION_EQUAL CUDAToolkit_VERSION) "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIRS}'") endif() +if(NOT TARGET CUDA::nvToolsExt) + message(FATAL_ERROR "Failed to find nvToolsExt") +endif() + message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION}) message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) @@ -210,15 +214,10 @@ else() endif() # nvToolsExt -find_path(nvtx3_dir NAMES nvtx3 PATHS "${CUDA_INCLUDE_DIRS}" "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH) -find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) -if(nvtx3_FOUND) - add_library(torch::nvtoolsext INTERFACE IMPORTED) - target_include_directories(torch::nvtoolsext INTERFACE "${nvtx3_dir}") -else() - message(WARNING "Cannot find NVTX3") -endif() - +add_library(torch::nvtoolsext INTERFACE IMPORTED) +set_property( + TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES + CUDA::nvToolsExt) # cublas add_library(caffe2::cublas INTERFACE IMPORTED) diff --git a/setup.py b/setup.py index f4ed3d8945554..5e59c4a0986a1 100644 --- a/setup.py +++ b/setup.py @@ -189,6 +189,8 @@ # NCCL_INCLUDE_DIR # specify where nccl is installed # +# NVTOOLSEXT_PATH (Windows only) +# specify where nvtoolsext is installed # # ACL_ROOT_DIR # specify where Compute Library is installed diff --git a/third_party/NVTX b/third_party/NVTX deleted file mode 160000 index e170594ac7cf1..0000000000000 --- a/third_party/NVTX +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e170594ac7cf1dac584da473d4ca9301087090c1 diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 8dd044f0c9ebf..b0d7bd842d335 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -130,6 +130,7 @@ if(USE_CUDA) list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::cudnn) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN) endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext) endif() diff --git a/torch/__init__.py b/torch/__init__.py index cdd9ddcffae56..0eaf86277b574 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -80,6 +80,12 @@ def _running_with_deploy(): dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, base_py_dll_path])) + if all(not os.path.exists(os.path.join(p, 'nvToolsExt64_1.dll')) for p in dll_paths): + nvtoolsext_dll_path = os.path.join( + os.getenv('NVTOOLSEXT_PATH', os.path.join(pfiles_path, 'NVIDIA Corporation', 'NvToolsExt')), 'bin', 'x64') + else: + nvtoolsext_dll_path = '' + from .version import cuda as cuda_version import glob if cuda_version and all(not glob.glob(os.path.join(p, 'cudart64*.dll')) for p in dll_paths): @@ -90,7 +96,7 @@ def _running_with_deploy(): else: cuda_path = '' - dll_paths.extend(filter(os.path.exists, [cuda_path])) + dll_paths.extend(filter(os.path.exists, [nvtoolsext_dll_path, cuda_path])) kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True) with_load_library_flags = hasattr(kernel32, 'AddDllDirectory') @@ -182,6 +188,7 @@ def _load_global_deps() -> None: 'cusolver': 'libcusolver.so.*[0-9]', 'cusparse': 'libcusparse.so.*[0-9]', 'nccl': 'libnccl.so.*[0-9]', + 'nvtx': 'libnvToolsExt.so.*[0-9]', } is_cuda_lib_err = [lib for lib in cuda_libs.values() if(lib.split('.')[0] in err.args[0])] if not is_cuda_lib_err: diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp index b813e8fad8389..4fb72c5f79be2 100644 --- a/torch/csrc/cuda/shared/nvtx.cpp +++ b/torch/csrc/cuda/shared/nvtx.cpp @@ -1,7 +1,7 @@ #ifdef _WIN32 #include // _wgetenv for nvtx #endif -#include +#include #include namespace torch::cuda::shared { @@ -9,7 +9,7 @@ namespace torch::cuda::shared { void initNvtxBindings(PyObject* module) { auto m = py::handle(module).cast(); - auto nvtx = m.def_submodule("_nvtx", "nvtx3 bindings"); + auto nvtx = m.def_submodule("_nvtx", "libNvToolsExt.so bindings"); nvtx.def("rangePushA", nvtxRangePushA); nvtx.def("rangePop", nvtxRangePop); nvtx.def("rangeStartA", nvtxRangeStartA); diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp index 2ea4a975ba851..7c8c13034df33 100644 --- a/torch/csrc/profiler/stubs/cuda.cpp +++ b/torch/csrc/profiler/stubs/cuda.cpp @@ -1,6 +1,6 @@ #include -#include +#include #include #include diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index b363097be3ca6..fa727a7c078c8 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -647,7 +647,7 @@ ("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), ("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), ("cub/device/device_select.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), - ("nvtx3/nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)), + ("nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)), ("nvml.h", ("rocm_smi/rocm_smi.h", CONV_INCLUDE, API_ROCMSMI)), ] )