Remove NCCL capability from pycuda, use cupy engines instead #524

Merged 1 commit on Feb 2, 2024
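
A minimal usage sketch of what remains in the pycuda backend after this change: get_multi_gpu_communicator() now only chooses between CUDA-aware MPI and the plain-MPI fallback (the fallback branch is truncated in the diff below). The cupy module path mentioned in the comments is an assumption suggested by the PR title, not something shown in this diff.

# Sketch: obtaining a multi-GPU communicator from the pycuda backend after
# NCCL removal. Only names visible in the diff below are used; the cupy
# path at the end is an assumption.
from ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu

# Returns MultiGpuCommunicatorCudaMpi when CUDA-aware MPI is available,
# otherwise falls back to the plain-MPI communicator.
comm = mgpu.get_multi_gpu_communicator(use_cuda_mpi=True)

# In-place sum reduction of a GPU array across all MPI ranks:
# comm.allReduceSum(gpu_array)

# NCCL users are expected to move to the cupy engines, e.g. (assumed path):
# from ptypy.accelerate.cuda_cupy import multi_gpu as mgpu_cupy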
ptypy/accelerate/cuda_pycuda/multi_gpu.py (72 changes: 3 additions & 69 deletions)
@@ -25,6 +25,8 @@
4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used.
It should be in DEFAULT mode.

5) NCCL support has been dropped from the PyCUDA module, but can be used with the CuPy module instead

"""

from pkg_resources import parse_version
@@ -35,26 +37,13 @@
from ptypy.utils.verbose import logger, log
import os

try:
from cupy.cuda import nccl
import cupy as cp
except ImportError:
nccl = None

try:
import mpi4py
except ImportError:
mpi4py = None

# properties to check which versions are available

# use NCCL if it is available, and the user didn't override the
# default selection with environment variables
have_nccl = (nccl is not None) and \
(not 'PTYPY_USE_CUDAMPI' in os.environ) and \
(not 'PTYPY_USE_MPI' in os.environ) and \
('PTYPY_USE_NCCL' in os.environ)

# At the moment, we require:
# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true,
# mpi4py >= 3.1.0
@@ -109,64 +98,9 @@ def allReduceSum(self, arr):
comm = parallel.comm
comm.Allreduce(parallel.MPI.IN_PLACE, arr)


class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase):

def __init__(self):
super().__init__()

# Check if GPUs are in default mode
if cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT:
raise RuntimeError("Compute mode must be default in order to use NCCL")

# get a unique identifier for the NCCL communicator and
# broadcast it to all MPI processes (assuming one device per process)
if self.rank == 0:
self.id = nccl.get_unique_id()
else:
self.id = None

self.id = parallel.bcast(self.id)

self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank)

def allReduceSum(self, arr):
"""Call MPI.all_reduce in-place, with array on GPU"""

buf = int(arr.gpudata)
count, datatype = self.__get_NCCL_count_dtype(arr)

# no stream support here for now - it fails in NCCL when
# pycuda.Stream.handle is used for some unexplained reason
stream = cp.cuda.Stream.null.ptr

self.com.allReduce(buf, buf, count, datatype, nccl.NCCL_SUM, stream)

def __get_NCCL_count_dtype(self, arr):
if arr.dtype == np.complex64:
return arr.size*2, nccl.NCCL_FLOAT32
elif arr.dtype == np.complex128:
return arr.size*2, nccl.NCCL_FLOAT64
elif arr.dtype == np.float32:
return arr.size, nccl.NCCL_FLOAT32
elif arr.dtype == np.float64:
return arr.size, nccl.NCCL_FLOAT64
else:
raise ValueError("This dtype is not supported by NCCL.")


# pick the appropriate communicator depending on installed packages
def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True):
if have_nccl and use_nccl:
try:
comm = MultiGpuCommunicatorNccl()
log(4, "Using NCCL communicator")
return comm
except RuntimeError:
pass
except AttributeError:
# see issue #323
pass
def get_multi_gpu_communicator(use_cuda_mpi=True):
if have_cuda_mpi and use_cuda_mpi:
try:
comm = MultiGpuCommunicatorCudaMpi()
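
For reference, the deleted MultiGpuCommunicatorNccl wrapped the cupy NCCL bindings directly. A minimal single-process sketch of that same pattern, using only the cupy.cuda.nccl calls visible in the removed code, is below; the ptypy cupy engine presumably wraps the same calls, but that module is not part of this diff.

# Single-process sketch of the NCCL all-reduce pattern the deleted class used.
# With more than one process, the unique id would be created on rank 0 and
# broadcast via MPI, as in the removed __init__.
import cupy as cp
from cupy.cuda import nccl

uid = nccl.get_unique_id()
comm = nccl.NcclCommunicator(1, uid, 0)   # (ndev, unique_id, rank)

arr = cp.ones(8, dtype=cp.float32)
stream = cp.cuda.Stream.null.ptr
# In-place sum; complex arrays would pass 2*arr.size with the matching real
# dtype, as the removed __get_NCCL_count_dtype helper did.
comm.allReduce(arr.data.ptr, arr.data.ptr, arr.size,
               nccl.NCCL_FLOAT32, nccl.NCCL_SUM, stream)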
test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py (4 changes: 0 additions & 4 deletions)
@@ -85,7 +85,3 @@ def test_multigpu_mpi(self):
@unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available")
def test_multigpu_cudampi(self):
self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi())

@unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
def test_multigpu_nccl(self):
self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
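
If an NCCL test is still wanted, the natural place is the cupy test suite; a sketch mirroring the deleted test is below. The module path, the have_nccl flag, and the communicator class name on the cupy side are assumptions copied from the pycuda names.

# Hypothetical cupy-side counterpart of the deleted test (assumed names).
import unittest
import numpy as np

try:
    from ptypy.accelerate.cuda_cupy import multi_gpu as mgpu  # assumed path
except ImportError:
    mgpu = None

class MultiGpuCupyTest(unittest.TestCase):

    @unittest.skipIf(mgpu is None or not getattr(mgpu, "have_nccl", False),
                     "NCCL not available")
    def test_multigpu_nccl(self):
        import cupy as cp
        comm = mgpu.MultiGpuCommunicatorNccl()
        data = cp.ones((10,), dtype=np.float32)
        comm.allReduceSum(data)
        # With a single rank the reduction leaves the values unchanged.
        np.testing.assert_allclose(cp.asnumpy(data), 1.0)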