Commit 5971508: Remove cuda-python from project (#245)
Remove cuda-python and use CuPy APIs instead

---------

Co-authored-by: Changho Hwang <[email protected]>
Binyang2014 and chhwang authored Feb 13, 2024
1 parent d97fef4 commit 5971508
Showing 7 changed files with 71 additions and 76 deletions.
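At a glance, the change swaps cuda-python driver calls (cuModuleLoadData, cuModuleGetFunction, cuLaunchKernel) for their CuPy equivalents under cp.cuda.driver. A minimal sketch of the new pattern, assuming a prebuilt PTX file my_kernel.ptx containing a kernel my_kernel that takes one device pointer (both names are illustrative, not from this repo):

import ctypes

import cupy as cp
import numpy as np

# Load the compiled module and look up the kernel through CuPy's driver bindings.
with open("my_kernel.ptx", "rb") as f:
    module = cp.cuda.driver.moduleLoadData(f.read())
kernel = cp.cuda.driver.moduleGetFunction(module, "my_kernel")

# Pass arguments via kernelParams: an array holding a pointer to each argument value.
x = cp.zeros(1024, dtype=cp.float32)
arg = ctypes.c_void_p(x.data.ptr)
arg_ptrs = np.array([ctypes.addressof(arg)], dtype=np.uint64)

# Launch with grid (4,1,1), block (256,1,1), no shared memory, on the null stream.
cp.cuda.driver.launchKernel(kernel, 4, 1, 1, 256, 1, 1, 0, 0, arg_ptrs.ctypes.data, 0)
cp.cuda.runtime.deviceSynchronize()
cp.cuda.driver.moduleUnload(module)

The utils.py diff below uses the driver's alternative "extra" parameter path rather than kernelParams; see the note after that file.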
114 changes: 53 additions & 61 deletions python/mscclpp/utils.py
@@ -8,7 +8,6 @@
 import tempfile
 from typing import Any, Type
 
-from cuda import cuda, nvrtc, cudart
 import cupy as cp
 import numpy as np
 
@@ -22,62 +21,42 @@
 torchTensor = Type[Any]
 
 
-def _check_cuda_errors(result):
-    if result[0].value:
-        raise RuntimeError(f"CUDA error code={result[0].value}({_cuda_get_error(result[0])})")
-    if len(result) == 1:
-        return None
-    elif len(result) == 2:
-        return result[1]
-    else:
-        return result[1:]
-
-
-def _cuda_get_error(error):
-    if isinstance(error, cuda.CUresult):
-        err, name = cuda.cuGetErrorName(error)
-        return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
-    elif isinstance(error, cudart.cudaError_t):
-        return cudart.cudaGetErrorName(error)[1]
-    elif isinstance(error, nvrtc.nvrtcResult):
-        return nvrtc.nvrtcGetErrorString(error)[1]
-    else:
-        raise RuntimeError("Unknown error type: {}".format(error))
-
-
 class Kernel:
-    def __init__(self, ptx: bytes, kernel_name: str, device_id: int):
-        self._context = _check_cuda_errors(cuda.cuCtxGetCurrent())
-        assert self._context is not None
-        self._module = _check_cuda_errors(cuda.cuModuleLoadData(ptx))
-        self._kernel = _check_cuda_errors(cuda.cuModuleGetFunction(self._module, kernel_name.encode()))
+    CU_LAUNCH_PARAM_BUFFER_POINTER = 0x01
+    CU_LAUNCH_PARAM_BUFFER_SIZE = 0x02
+    CU_LAUNCH_PARAM_END = 0x00 if not cp.cuda.runtime.is_hip else 0x03
+
+    def __init__(self, ptx: bytes, kernel_name: str):
+        self._module = cp.cuda.driver.moduleLoadData(ptx)
+        self._kernel = cp.cuda.driver.moduleGetFunction(self._module, kernel_name)
 
     def launch_kernel(
         self,
         params: bytes,
         nblocks: int,
         nthreads: int,
         shared: int,
-        stream: Type[cuda.CUstream] or Type[cudart.cudaStream_t],
+        stream: Type[cp.cuda.Stream] or Type[None],
     ):
         buffer = (ctypes.c_byte * len(params)).from_buffer_copy(params)
         buffer_size = ctypes.c_size_t(len(params))
         config = np.array(
             [
-                cuda.CU_LAUNCH_PARAM_BUFFER_POINTER,
+                Kernel.CU_LAUNCH_PARAM_BUFFER_POINTER,
                 ctypes.addressof(buffer),
-                cuda.CU_LAUNCH_PARAM_BUFFER_SIZE,
+                Kernel.CU_LAUNCH_PARAM_BUFFER_SIZE,
                 ctypes.addressof(buffer_size),
-                cuda.CU_LAUNCH_PARAM_END,
+                Kernel.CU_LAUNCH_PARAM_END,
             ],
             dtype=np.uint64,
         )
-        _check_cuda_errors(
-            cuda.cuLaunchKernel(self._kernel, nblocks, 1, 1, nthreads, 1, 1, shared, stream, 0, config.ctypes.data)
+        cuda_stream = stream.ptr if stream else 0
+        cp.cuda.driver.launchKernel(
+            self._kernel, nblocks, 1, 1, nthreads, 1, 1, shared, cuda_stream, 0, config.ctypes.data
         )
 
     def __del__(self):
-        cuda.cuModuleUnload(self._module)
+        cp.cuda.driver.moduleUnload(self._module)


@@ -96,35 +75,48 @@ def __init__(self, file: str, kernel_name: str, file_dir: str = None, macro_dict
         self.macros = None
         if file_dir:
             self.macros = ["-D{}={}".format(macro, value) for macro, value in macro_dict.items()]
-        device_id = cp.cuda.Device().id
-        ptx = self._compile_cuda(os.path.join(self._current_file_dir, file), f"{kernel_name}.ptx", device_id)
-        self._kernel = Kernel(ptx, kernel_name, device_id)
+        ptx = self._compile_cuda(os.path.join(self._current_file_dir, file), f"{kernel_name}.ptx")
+        self._kernel = Kernel(ptx, kernel_name)
         self.kernel_map[kernel_key] = self._kernel
 
-    def _compile_cuda(self, source_file, output_file, device_id, std_version="c++17"):
+    def _compile_cuda(self, source_file, output_file, std_version="c++17"):
         mscclpp_home = os.environ.get("MSCCLPP_HOME", "/usr/local/mscclpp")
         include_dir = os.path.join(mscclpp_home, "include")
-        major = _check_cuda_errors(
-            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device_id)
-        )
-        minor = _check_cuda_errors(
-            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device_id)
-        )
-        cuda_home = os.environ.get("CUDA_HOME")
-        nvcc = os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"
-        command = [
-            nvcc,
-            f"-std={std_version}",
-            "-ptx",
-            "-Xcompiler",
-            "-Wall,-Wextra",
-            f"-I{include_dir}",
-            f"{source_file}",
-            f"--gpu-architecture=compute_{major}{minor}",
-            f"--gpu-code=sm_{major}{minor},compute_{major}{minor}",
-            "-o",
-            f"{self._tempdir.name}/{output_file}",
-        ]
+        if not cp.cuda.runtime.is_hip:
+            compute_capability = cp.cuda.Device().compute_capability
+            cuda_home = os.environ.get("CUDA_HOME")
+            nvcc = os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"
+            command = [
+                nvcc,
+                f"-std={std_version}",
+                "-ptx",
+                "-Xcompiler",
+                "-Wall,-Wextra",
+                f"-I{include_dir}",
+                f"{source_file}",
+                f"--gpu-architecture=compute_{compute_capability}",
+                f"--gpu-code=sm_{compute_capability},compute_{compute_capability}",
+                "-o",
+                f"{self._tempdir.name}/{output_file}",
+            ]
+        else:
+            # the gcn arch name is like "gfx942:sramecc+:xnack-"
+            gcn_arch = (
+                cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)["gcnArchName"].decode("utf-8").split(":")[0]
+            )
+            rocm_home = os.environ.get("ROCM_HOME")
+            hipcc = os.path.join(rocm_home, "bin/hipcc") if rocm_home else "hipcc"
+            command = [
+                hipcc,
+                f"-std={std_version}",
+                "--genco",
+                "-D__HIP_PLATFORM_AMD__",
+                f"--offload-arch={gcn_arch}",
+                f"-I{include_dir}",
+                f"{source_file}",
+                "-o",
+                f"{self._tempdir.name}/{output_file}",
+            ]
         if self.macros:
             command += self.macros
         try:
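Note on the launch path above: rather than kernelParams, launch_kernel packs all arguments into one byte buffer and hands it to the driver through the "extra" array (CU_LAUNCH_PARAM_BUFFER_POINTER / CU_LAUNCH_PARAM_BUFFER_SIZE / CU_LAUNCH_PARAM_END); HIP uses 0x03 instead of 0x00 as the end marker, hence the switch on cp.cuda.runtime.is_hip. A hedged sketch of how a caller might build params for a kernel taking a pointer and an int (the byte layout must match the kernel's parameter struct, including alignment):

import struct

import cupy as cp

x = cp.zeros(1024, dtype=cp.float32)
# Native layout: 8-byte pointer followed by a 4-byte int.
params = struct.pack("P i", x.data.ptr, 1024)
# `kernel` is a Kernel instance built elsewhere, e.g. by KernelBuilder.
kernel.launch_kernel(params, nblocks=4, nthreads=256, shared=0, stream=None)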
4 changes: 4 additions & 0 deletions python/mscclpp_benchmark/allreduce.cu
@@ -1,7 +1,11 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
+#if defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_fp16.h>
+#else
 #include <cuda_fp16.h>
+#endif
 
 #include <mscclpp/concurrency_device.hpp>
 #include <mscclpp/nvls_device.hpp>
4 changes: 2 additions & 2 deletions python/mscclpp_benchmark/allreduce_bench.py
@@ -84,7 +84,7 @@ def check_correctness(memory, func, niter=100):
     for p in range(niter):
         memory[:] = cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + MPI.COMM_WORLD.rank)
         cp.cuda.runtime.deviceSynchronize()
-        output_memory = func(0)
+        output_memory = func(None)
         cp.cuda.runtime.deviceSynchronize()
         expected = cp.zeros_like(memory)
         for i in range(MPI.COMM_WORLD.size):
@@ -110,7 +110,7 @@ def bench_time(niter: int, func):
     with stream:
         stream.begin_capture()
         for i in range(niter):
-            func(stream.ptr)
+            func(stream)
         graph = stream.end_capture()
 
     # now run a warm up round
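With ops now taking the cp.cuda.Stream object (or None) instead of a raw pointer, only the lowest layer touches .ptr. For context, a minimal sketch of the capture-and-replay pattern bench_time relies on, with func standing in for any of the op wrappers and niter for the iteration count:

import cupy as cp

stream = cp.cuda.Stream(non_blocking=True)
with stream:
    stream.begin_capture()
    for _ in range(niter):       # iterations are recorded, not executed
        func(stream)
    graph = stream.end_capture()

graph.launch(stream)             # replays all captured launches at once
stream.synchronize()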
20 changes: 10 additions & 10 deletions python/mscclpp_benchmark/mscclpp_op.py
@@ -66,8 +66,8 @@ def __init__(
 
         self.set_params(nblocks, block_size, read_only)
 
-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
         return self.memory
 
     def set_params(self, nblocks, block_size, read_only):
@@ -131,8 +131,8 @@ def __init__(
 
         self.set_params(nblocks, block_size)
 
-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
         return self.memory_out
 
     def set_params(self, nblocks, block_size):
@@ -201,8 +201,8 @@ def __init__(
 
         self.set_params(nblocks, block_size)
 
-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, 24, 1024, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, 24, 1024, 0, stream)
         return self.memory
 
     def set_params(self, nblocks, block_size):
@@ -295,8 +295,8 @@ def __init__(
 
         self.set_params(nblocks, block_size, pipeline_depth)
 
-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
         return self.memory
 
     def set_params(self, nblocks, block_size, pipeline_depth):
@@ -388,8 +388,8 @@ def __init__(
 
         self.set_params(nblocks, block_size)
 
-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
         return self.memory_out
 
     def set_params(self, nblocks, block_size):
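All five wrappers now share one calling convention: pass a cp.cuda.Stream to enqueue on that stream (for example under graph capture), or None for the null stream. A brief usage sketch with a hypothetical op instance built from one of the classes above:

import cupy as cp

# Eager launch on the null stream:
result = op(None)

# Launch onto an explicit stream:
s = cp.cuda.Stream(non_blocking=True)
result = op(s)
s.synchronize()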
3 changes: 2 additions & 1 deletion python/mscclpp_benchmark/nccl_op.py
@@ -16,7 +16,8 @@ def __init__(self, nccl_comm: nccl.NcclCommunicator, memory: cp.ndarray):
         else:
             raise RuntimeError("Make sure that the data type is mapped to the correct NCCL data type")
 
-    def __call__(self, stream_ptr):
+    def __call__(self, stream):
+        stream_ptr = stream.ptr if stream else 0
         self.nccl_comm.allReduce(
             self.memory.data.ptr, self.memory.data.ptr, self.memory.size, self.nccl_dtype, nccl.NCCL_SUM, stream_ptr
         )
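cupy.cuda.nccl still takes a raw stream handle, so the wrapper unwraps .ptr at the boundary (None maps to the null stream, 0). A minimal single-rank sketch of the same call shape, purely illustrative:

import cupy as cp
from cupy.cuda import nccl

comm = nccl.NcclCommunicator(1, nccl.get_unique_id(), 0)  # 1 device, rank 0
x = cp.ones(1024, dtype=cp.float32)

stream = cp.cuda.Stream(non_blocking=True)
stream_ptr = stream.ptr if stream else 0  # same unwrap as __call__ above
comm.allReduce(x.data.ptr, x.data.ptr, x.size, nccl.NCCL_FLOAT32, nccl.NCCL_SUM, stream_ptr)
stream.synchronize()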
1 change: 0 additions & 1 deletion python/requirements_cu11.txt
@@ -1,7 +1,6 @@
 mpi4py
 cupy-cuda11x
 prettytable
-cuda-python
 netifaces
 pytest
 numpy
1 change: 0 additions & 1 deletion python/requirements_cu12.txt
@@ -1,7 +1,6 @@
 mpi4py
 cupy-cuda12x
 prettytable
-cuda-python
 netifaces
 pytest
 numpy
