diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat
index 7ebaaa051ee56..070e7a14687ee 100644
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@@ -66,6 +66,7 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
 set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
 set CUDNN_ROOT_DIR=%CUDA_PATH%
+set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
 set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
 
 :cuda_build_end
diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
index 4a464d6b5786a..4934804d2daea 100644
--- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
@@ -40,6 +40,7 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
 set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
 set CUDNN_ROOT_DIR=%CUDA_PATH%
+set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
 set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
 set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin
 set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice
diff --git a/.ci/pytorch/win-test-helpers/test_custom_backend.bat b/.ci/pytorch/win-test-helpers/test_custom_backend.bat
index bc15b19ed87be..e209abacca9b9 100644
--- a/.ci/pytorch/win-test-helpers/test_custom_backend.bat
+++ b/.ci/pytorch/win-test-helpers/test_custom_backend.bat
@@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1
 
 :: Run tests C++-side and load the exported script module.
 cd build
-set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
+set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
 test_custom_backend.exe model.pt
 if ERRORLEVEL 1 exit /b 1
diff --git a/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat b/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat
index bc5ffd87e0dc3..08979059fb283 100644
--- a/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat
+++ b/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat
@@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1
 
 :: Run tests C++-side and load the exported script module.
 cd build
-set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
+set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
 test_custom_ops.exe model.pt
 if ERRORLEVEL 1 exit /b 1
diff --git a/.gitmodules b/.gitmodules
index 32c7a83705ad8..7e1b09e591cd5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -149,6 +149,3 @@
 [submodule "third_party/mimalloc"]
 	path = third_party/mimalloc
 	url = https://github.com/microsoft/mimalloc.git
-[submodule "third_party/NVTX"]
-	path = third_party/NVTX
-	url = https://github.com/NVIDIA/NVTX.git
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index ba75466ff7572..748363725bcc3 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1548,8 +1548,7 @@ if(USE_CUDA)
     target_link_libraries(torch_cpu PRIVATE torch::cudart)
   endif()
   target_link_libraries(torch_cuda INTERFACE torch::cudart)
-  target_link_libraries(torch_cuda PUBLIC c10_cuda)
-  target_link_libraries(torch_cuda PRIVATE torch::nvtoolsext)
+  target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext)
 
   target_include_directories(
       torch_cuda INTERFACE $<INSTALL_INTERFACE:include>)
@@ -1606,7 +1605,7 @@ if(BUILD_SHARED_LIBS)
   # not find them, because they're usually in non-standard locations)
   if(USE_CUDA)
     target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
-    target_link_libraries(torch_global_deps torch::cudart)
+    target_link_libraries(torch_global_deps torch::cudart torch::nvtoolsext)
   endif()
   if(USE_TBB)
     target_link_libraries(torch_global_deps TBB::tbb)
diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py
index 283f08cd1f546..888d286458a3a 100644
--- a/caffe2/python/__init__.py
+++ b/caffe2/python/__init__.py
@@ -26,6 +26,13 @@
     th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch')
     th_dll_path = os.path.join(th_root, 'lib')
 
+    # Fall back to a system NvToolsExt install (NVTOOLSEXT_PATH or the default
+    # location) when neither th_dll_path nor py_dll_path contains the DLL.
+    nvtoolsext_dll_path = ''
+    if not any(os.path.exists(os.path.join(d, 'nvToolsExt64_1.dll')) for d in (th_dll_path, py_dll_path)):
+        nvtoolsext_dll_path = os.path.join(
+            os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt'), 'bin', 'x64')
+
     import importlib.util
     import glob
     spec = importlib.util.spec_from_file_location('torch_version', os.path.join(th_root, 'version.py'))
@@ -43,7 +50,7 @@
 
     import ctypes
     kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
-    dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, cuda_path]))
+    dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path]))
     with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')
     prev_error_mode = kernel32.SetErrorMode(0x0001)
 
diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in
index 37344dcc62e03..6d518a1489626 100644
--- a/cmake/TorchConfig.cmake.in
+++ b/cmake/TorchConfig.cmake.in
@@ -129,18 +129,32 @@ endif()
 
 if(@USE_CUDA@)
   if(MSVC)
-    set(TORCH_CUDA_LIBRARIES ${CUDA_LIBRARIES})
+    if(NOT NVTOOLEXT_HOME)
+      set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt")
+    endif()
+    if(DEFINED ENV{NVTOOLSEXT_PATH})
+      # NVTOOLSEXT_PATH may hold a native Windows path; normalize backslashes
+      # to forward slashes so the composed library/include paths stay uniform.
+      file(TO_CMAKE_PATH "$ENV{NVTOOLSEXT_PATH}" NVTOOLEXT_HOME)
+    endif()
+    set(TORCH_CUDA_LIBRARIES
+      "${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib"
+      ${CUDA_LIBRARIES})
+    list(APPEND TORCH_INCLUDE_DIRS "${NVTOOLEXT_HOME}/include")
     find_library(CAFFE2_NVRTC_LIBRARY caffe2_nvrtc PATHS "${TORCH_INSTALL_PREFIX}/lib")
     list(APPEND TORCH_CUDA_LIBRARIES ${CAFFE2_NVRTC_LIBRARY})
   elseif(APPLE)
     set(TORCH_CUDA_LIBRARIES
       ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib
       ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib
+      ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib
       ${CUDA_LIBRARIES})
   else()
+    find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/)
     set(TORCH_CUDA_LIBRARIES
       ${CUDA_CUDA_LIB}
       ${CUDA_NVRTC_LIB}
+      ${LIBNVTOOLSEXT}
       ${CUDA_LIBRARIES})
   endif()
   if(@BUILD_SHARED_LIBS@)
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index 43983400cacc8..c7595774d810b 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -66,6 +66,10 @@ if(NOT CMAKE_CUDA_COMPILER_VERSION VERSION_EQUAL CUDAToolkit_VERSION)
                       "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIRS}'")
 endif()
 
+if(NOT TARGET CUDA::nvToolsExt)
+  message(FATAL_ERROR "Failed to find nvToolsExt")
+endif()
+
 message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION})
 message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE})
 message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR})
@@ -210,15 +214,9 @@ else()
 endif()
 
 # nvToolsExt
-find_path(nvtx3_dir NAMES nvtx3 PATHS "${CUDA_INCLUDE_DIRS}" "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH)
-find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir)
-if(nvtx3_FOUND)
-  add_library(torch::nvtoolsext INTERFACE IMPORTED)
-  target_include_directories(torch::nvtoolsext INTERFACE "${nvtx3_dir}")
-else()
-  message(WARNING "Cannot find NVTX3")
-endif()
-
+# Interface target that forwards its link requirements to CUDA::nvToolsExt.
+add_library(torch::nvtoolsext INTERFACE IMPORTED)
+target_link_libraries(torch::nvtoolsext INTERFACE CUDA::nvToolsExt)
 
 # cublas
 add_library(caffe2::cublas INTERFACE IMPORTED)
diff --git a/setup.py b/setup.py
index f4ed3d8945554..5e59c4a0986a1 100644
--- a/setup.py
+++ b/setup.py
@@ -189,6 +189,8 @@
 #   NCCL_INCLUDE_DIR
 #     specify where nccl is installed
 #
+#   NVTOOLSEXT_PATH (Windows only)
+#     specify where nvtoolsext is installed
 #
 #   ACL_ROOT_DIR
 #     specify where Compute Library is installed
diff --git a/third_party/NVTX b/third_party/NVTX
deleted file mode 160000
index e170594ac7cf1..0000000000000
--- a/third_party/NVTX
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit e170594ac7cf1dac584da473d4ca9301087090c1
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 8dd044f0c9ebf..b0d7bd842d335 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -130,6 +130,7 @@ if(USE_CUDA)
         list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::cudnn)
         list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN)
     endif()
+
     list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext)
 endif()
 
diff --git a/torch/__init__.py b/torch/__init__.py
index cdd9ddcffae56..0eaf86277b574 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -80,6 +80,12 @@ def _running_with_deploy():
 
     dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, base_py_dll_path]))
 
+    # Fall back to a system NvToolsExt install when no DLL dir ships the DLL.
+    nvtoolsext_dll_path = ''
+    if not any(os.path.exists(os.path.join(p, 'nvToolsExt64_1.dll')) for p in dll_paths):
+        nvtoolsext_dll_path = os.path.join(
+            os.getenv('NVTOOLSEXT_PATH', os.path.join(pfiles_path, 'NVIDIA Corporation', 'NvToolsExt')), 'bin', 'x64')
+
     from .version import cuda as cuda_version
     import glob
     if cuda_version and all(not glob.glob(os.path.join(p, 'cudart64*.dll')) for p in dll_paths):
@@ -90,7 +96,7 @@ def _running_with_deploy():
     else:
         cuda_path = ''
 
-    dll_paths.extend(filter(os.path.exists, [cuda_path]))
+    dll_paths.extend(filter(os.path.exists, [nvtoolsext_dll_path, cuda_path]))
 
     kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
     with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')
@@ -182,6 +188,7 @@ def _load_global_deps() -> None:
             'cusolver': 'libcusolver.so.*[0-9]',
             'cusparse': 'libcusparse.so.*[0-9]',
             'nccl': 'libnccl.so.*[0-9]',
+            'nvtx': 'libnvToolsExt.so.*[0-9]',
         }
         is_cuda_lib_err = [lib for lib in cuda_libs.values() if(lib.split('.')[0] in err.args[0])]
         if not is_cuda_lib_err:
diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp
index b813e8fad8389..4fb72c5f79be2 100644
--- a/torch/csrc/cuda/shared/nvtx.cpp
+++ b/torch/csrc/cuda/shared/nvtx.cpp
@@ -1,7 +1,7 @@
 #ifdef _WIN32
 #include <wchar.h> // _wgetenv for nvtx
 #endif
-#include <nvtx3/nvToolsExt.h>
+#include <nvToolsExt.h>
 #include <torch/csrc/utils/pybind.h>
 
 namespace torch::cuda::shared {
@@ -9,7 +9,7 @@ namespace torch::cuda::shared {
 void initNvtxBindings(PyObject* module) {
   auto m = py::handle(module).cast<py::module>();
 
-  auto nvtx = m.def_submodule("_nvtx", "nvtx3 bindings");
+  auto nvtx = m.def_submodule("_nvtx", "libNvToolsExt.so bindings");
   nvtx.def("rangePushA", nvtxRangePushA);
   nvtx.def("rangePop", nvtxRangePop);
   nvtx.def("rangeStartA", nvtxRangeStartA);
diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp
index 2ea4a975ba851..7c8c13034df33 100644
--- a/torch/csrc/profiler/stubs/cuda.cpp
+++ b/torch/csrc/profiler/stubs/cuda.cpp
@@ -1,6 +1,6 @@
 #include <sstream>
 
-#include <nvtx3/nvToolsExt.h>
+#include <nvToolsExt.h>
 
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/ApproximateClock.h>
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index b363097be3ca6..fa727a7c078c8 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -647,7 +647,7 @@
         ("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
         ("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
         ("cub/device/device_select.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
-        ("nvtx3/nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)),
+        ("nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)),
         ("nvml.h", ("rocm_smi/rocm_smi.h", CONV_INCLUDE, API_ROCMSMI)),
     ]
 )