Remove NCCL capability from pycuda, use cupy engines instead #524

Merged 1 commit on Feb 2, 2024
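
A minimal usage sketch of what remains in the pycuda backend after this change: get_multi_gpu_communicator() now only chooses between CUDA-aware MPI and the plain-MPI fallback (the fallback branch is truncated in the diff below). The cupy module path mentioned in the comments is an assumption suggested by the PR title, not something shown in this diff.

# Sketch: obtaining a multi-GPU communicator from the pycuda backend after
# NCCL removal. Only names visible in the diff below are used; the cupy
# path at the end is an assumption.
from ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu

# Returns MultiGpuCommunicatorCudaMpi when CUDA-aware MPI is available,
# otherwise falls back to the plain-MPI communicator.
comm = mgpu.get_multi_gpu_communicator(use_cuda_mpi=True)

# In-place sum reduction of a GPU array across all MPI ranks:
# comm.allReduceSum(gpu_array)

# NCCL users are expected to move to the cupy engines, e.g. (assumed path):
# from ptypy.accelerate.cuda_cupy import multi_gpu as mgpu_cupy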
ptypy/accelerate/cuda_pycuda/multi_gpu.py (72 changes: 3 additions & 69 deletions)
@@ -25,6 +25,8 @@
4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used.
It should be in DEFAULT mode.

5) NCCL support has been dropped from the PyCUDA module, but can be used with the CuPy module instead

"""

from pkg_resources import parse_version
@@ -35,26 +37,13 @@
from ptypy.utils.verbose import logger, log
import os

try:
from cupy.cuda import nccl
import cupy as cp
except ImportError:
nccl = None

try:
import mpi4py
except ImportError:
mpi4py = None

# properties to check which versions are available

# use NCCL if it is available, and the user didn't override the
# default selection with environment variables
have_nccl = (nccl is not None) and \
(not 'PTYPY_USE_CUDAMPI' in os.environ) and \
(not 'PTYPY_USE_MPI' in os.environ) and \
('PTYPY_USE_NCCL' in os.environ)

# At the moment, we require:
# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true,
# mpi4py >= 3.1.0
@@ -109,64 +98,9 @@ def allReduceSum(self, arr):
comm = parallel.comm
comm.Allreduce(parallel.MPI.IN_PLACE, arr)


class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase):

def __init__(self):
super().__init__()

# Check if GPUs are in default mode
if cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT:
raise RuntimeError("Compute mode must be default in order to use NCCL")

# get a unique identifier for the NCCL communicator and
# broadcast it to all MPI processes (assuming one device per process)
if self.rank == 0:
self.id = nccl.get_unique_id()
else:
self.id = None

self.id = parallel.bcast(self.id)

self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank)

def allReduceSum(self, arr):
"""Call MPI.all_reduce in-place, with array on GPU"""

buf = int(arr.gpudata)
count, datatype = self.__get_NCCL_count_dtype(arr)

# no stream support here for now - it fails in NCCL when
# pycuda.Stream.handle is used for some unexplained reason
stream = cp.cuda.Stream.null.ptr

self.com.allReduce(buf, buf, count, datatype, nccl.NCCL_SUM, stream)

def __get_NCCL_count_dtype(self, arr):
if arr.dtype == np.complex64:
return arr.size*2, nccl.NCCL_FLOAT32
elif arr.dtype == np.complex128:
return arr.size*2, nccl.NCCL_FLOAT64
elif arr.dtype == np.float32:
return arr.size, nccl.NCCL_FLOAT32
elif arr.dtype == np.float64:
return arr.size, nccl.NCCL_FLOAT64
else:
raise ValueError("This dtype is not supported by NCCL.")


# pick the appropriate communicator depending on installed packages
def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True):
if have_nccl and use_nccl:
try:
comm = MultiGpuCommunicatorNccl()
log(4, "Using NCCL communicator")
return comm
except RuntimeError:
pass
except AttributeError:
# see issue #323
pass
def get_multi_gpu_communicator(use_cuda_mpi=True):
if have_cuda_mpi and use_cuda_mpi:
try:
comm = MultiGpuCommunicatorCudaMpi()
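
For reference, the deleted MultiGpuCommunicatorNccl wrapped the cupy NCCL bindings directly. A minimal single-process sketch of that same pattern, using only the cupy.cuda.nccl calls visible in the removed code, is below; the ptypy cupy engine presumably wraps the same calls, but that module is not part of this diff.

# Single-process sketch of the NCCL all-reduce pattern the deleted class used.
# With more than one process, the unique id would be created on rank 0 and
# broadcast via MPI, as in the removed __init__.
import cupy as cp
from cupy.cuda import nccl

uid = nccl.get_unique_id()
comm = nccl.NcclCommunicator(1, uid, 0)   # (ndev, unique_id, rank)

arr = cp.ones(8, dtype=cp.float32)
stream = cp.cuda.Stream.null.ptr
# In-place sum; complex arrays would pass 2*arr.size with the matching real
# dtype, as the removed __get_NCCL_count_dtype helper did.
comm.allReduce(arr.data.ptr, arr.data.ptr, arr.size,
               nccl.NCCL_FLOAT32, nccl.NCCL_SUM, stream)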
test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py (4 changes: 0 additions & 4 deletions)
@@ -85,7 +85,3 @@ def test_multigpu_mpi(self):
@unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available")
def test_multigpu_cudampi(self):
self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi())

@unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
def test_multigpu_nccl(self):
self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
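
If an NCCL test is still wanted, the natural place is the cupy test suite; a sketch mirroring the deleted test is below. The module path, the have_nccl flag, and the communicator class name on the cupy side are assumptions copied from the pycuda names.

# Hypothetical cupy-side counterpart of the deleted test (assumed names).
import unittest
import numpy as np

try:
    from ptypy.accelerate.cuda_cupy import multi_gpu as mgpu  # assumed path
except ImportError:
    mgpu = None

class MultiGpuCupyTest(unittest.TestCase):

    @unittest.skipIf(mgpu is None or not getattr(mgpu, "have_nccl", False),
                     "NCCL not available")
    def test_multigpu_nccl(self):
        import cupy as cp
        comm = mgpu.MultiGpuCommunicatorNccl()
        data = cp.ones((10,), dtype=np.float32)
        comm.allReduceSum(data)
        # With a single rank the reduction leaves the values unchanged.
        np.testing.assert_allclose(cp.asnumpy(data), 1.0)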