Skip to content

Commit

Permalink
remove NCCL capability from pycuda, use cupy engines instead (#524)
Browse files Browse the repository at this point in the history
  • Loading branch information
daurer authored Feb 2, 2024
1 parent be94818 commit 2fa9f5a
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 73 deletions.
72 changes: 3 additions & 69 deletions ptypy/accelerate/cuda_pycuda/multi_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used.
It should be in DEFAULT mode.
5) NCCL support has been dropped from the PyCUDA module; use the CuPy module instead if NCCL is required
"""

from pkg_resources import parse_version
Expand All @@ -35,26 +37,13 @@
from ptypy.utils.verbose import logger, log
import os

try:
from cupy.cuda import nccl
import cupy as cp
except ImportError:
nccl = None

try:
import mpi4py
except ImportError:
mpi4py = None

# properties to check which versions are available

# use NCCL if it is available, and the user didn't override the
# default selection with environment variables
# NCCL is selected only when all of the following hold:
#   * the cupy NCCL bindings imported successfully,
#   * the user explicitly opted in via PTYPY_USE_NCCL,
#   * no competing backend override (PTYPY_USE_CUDAMPI / PTYPY_USE_MPI) is set.
have_nccl = (
    nccl is not None
    and 'PTYPY_USE_CUDAMPI' not in os.environ
    and 'PTYPY_USE_MPI' not in os.environ
    and 'PTYPY_USE_NCCL' in os.environ
)

# At the moment, we require:
# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true,
# mpi4py >= 3.1.0
Expand Down Expand Up @@ -109,64 +98,9 @@ def allReduceSum(self, arr):
comm = parallel.comm
comm.Allreduce(parallel.MPI.IN_PLACE, arr)


class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase):
    """Multi-GPU all-reduce backed by NCCL (via the CuPy bindings).

    Assumes one GPU device per MPI process: rank 0 creates the unique
    NCCL id and broadcasts it over MPI so that every rank joins the same
    NCCL communicator.
    """

    def __init__(self):
        super().__init__()

        # NCCL peer-to-peer transfers require the device to be in DEFAULT
        # compute mode (EXCLUSIVE mode is not supported).
        attrs = cuda.Context.get_device().get_attributes()
        if attrs[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT:
            raise RuntimeError("Compute mode must be default in order to use NCCL")

        # Rank 0 generates the unique communicator id; all other ranks
        # receive it through an MPI broadcast (one device per process).
        self.id = nccl.get_unique_id() if self.rank == 0 else None
        self.id = parallel.bcast(self.id)

        self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank)

    def allReduceSum(self, arr):
        """Call MPI.all_reduce in-place, with array on GPU"""

        ptr = int(arr.gpudata)
        count, datatype = self.__get_NCCL_count_dtype(arr)

        # no stream support here for now - it fails in NCCL when
        # pycuda.Stream.handle is used for some unexplained reason
        null_stream = cp.cuda.Stream.null.ptr

        self.com.allReduce(ptr, ptr, count, datatype, nccl.NCCL_SUM, null_stream)

    def __get_NCCL_count_dtype(self, arr):
        # Return (element count, NCCL dtype) for *arr*. Complex arrays are
        # reduced component-wise over their real/imag parts, hence the
        # doubled count.
        dt = arr.dtype
        if dt == np.complex64:
            return 2 * arr.size, nccl.NCCL_FLOAT32
        if dt == np.complex128:
            return 2 * arr.size, nccl.NCCL_FLOAT64
        if dt == np.float32:
            return arr.size, nccl.NCCL_FLOAT32
        if dt == np.float64:
            return arr.size, nccl.NCCL_FLOAT64
        raise ValueError("This dtype is not supported by NCCL.")


# pick the appropriate communicator depending on installed packages
def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True):
if have_nccl and use_nccl:
try:
comm = MultiGpuCommunicatorNccl()
log(4, "Using NCCL communicator")
return comm
except RuntimeError:
pass
except AttributeError:
# see issue #323
pass
def get_multi_gpu_communicator(use_cuda_mpi=True):
if have_cuda_mpi and use_cuda_mpi:
try:
comm = MultiGpuCommunicatorCudaMpi()
Expand Down
4 changes: 0 additions & 4 deletions test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,3 @@ def test_multigpu_mpi(self):
@unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available")
def test_multigpu_cudampi(self):
    # Exercise the CUDA-aware MPI communicator through the shared tester.
    communicator = mgpu.MultiGpuCommunicatorCudaMpi()
    self.multigpu_tester(communicator)

@unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
def test_multigpu_nccl(self):
    # Exercise the NCCL-backed communicator through the shared tester.
    communicator = mgpu.MultiGpuCommunicatorNccl()
    self.multigpu_tester(communicator)

0 comments on commit 2fa9f5a

Please sign in to comment.