From 2fa9f5a7868c83a0de0f70e23e95b365cb375bee Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Fri, 2 Feb 2024 16:10:28 +0000 Subject: [PATCH] remove NCCL capability from pycuda, use cupy engines instead (#524) --- ptypy/accelerate/cuda_pycuda/multi_gpu.py | 72 +------------------ .../cuda_pycuda_tests/multi_gpu_test.py | 4 -- 2 files changed, 3 insertions(+), 73 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/multi_gpu.py b/ptypy/accelerate/cuda_pycuda/multi_gpu.py index 33113c273..73138c3ee 100644 --- a/ptypy/accelerate/cuda_pycuda/multi_gpu.py +++ b/ptypy/accelerate/cuda_pycuda/multi_gpu.py @@ -25,6 +25,8 @@ 4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. It should be in DEFAULT mode. +5) NCCL support has been dropped from PyCUDA module, but can be used with CuPy module instead + """ from pkg_resources import parse_version @@ -35,12 +37,6 @@ from ptypy.utils.verbose import logger, log import os -try: - from cupy.cuda import nccl - import cupy as cp -except ImportError: - nccl = None - try: import mpi4py except ImportError: @@ -48,13 +44,6 @@ # properties to check which versions are available -# use NCCL is it is available, and the user didn't override the -# default selection with environment variables -have_nccl = (nccl is not None) and \ - (not 'PTYPY_USE_CUDAMPI' in os.environ) and \ - (not 'PTYPY_USE_MPI' in os.environ) and \ - ('PTYPY_USE_NCCL' in os.environ) - # At the moment, we require: # the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true, # mpi4py >= 3.1.0 @@ -109,64 +98,9 @@ def allReduceSum(self, arr): comm = parallel.comm comm.Allreduce(parallel.MPI.IN_PLACE, arr) - -class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase): - - def __init__(self): - super().__init__() - - # Check if GPUs are in default mode - if cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT: - raise RuntimeError("Compute mode must be default in order to use NCCL") - - # get a unique identifier for the NCCL communicator and - # broadcast it to all MPI processes (assuming one device per process) - if self.rank == 0: - self.id = nccl.get_unique_id() - else: - self.id = None - - self.id = parallel.bcast(self.id) - - self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank) - - def allReduceSum(self, arr): - """Call MPI.all_reduce in-place, with array on GPU""" - - buf = int(arr.gpudata) - count, datatype = self.__get_NCCL_count_dtype(arr) - - # no stream support here for now - it fails in NCCL when - # pycuda.Stream.handle is used for some unexplained reason - stream = cp.cuda.Stream.null.ptr - - self.com.allReduce(buf, buf, count, datatype, nccl.NCCL_SUM, stream) - - def __get_NCCL_count_dtype(self, arr): - if arr.dtype == np.complex64: - return arr.size*2, nccl.NCCL_FLOAT32 - elif arr.dtype == np.complex128: - return arr.size*2, nccl.NCCL_FLOAT64 - elif arr.dtype == np.float32: - return arr.size, nccl.NCCL_FLOAT32 - elif arr.dtype == np.float64: - return arr.size, nccl.NCCL_FLOAT64 - else: - raise ValueError("This dtype is not supported by NCCL.") - # pick the appropriate communicator depending on installed packages -def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True): - if have_nccl and use_nccl: - try: - comm = MultiGpuCommunicatorNccl() - log(4, "Using NCCL communicator") - return comm - except RuntimeError: - pass - except AttributeError: - # see issue #323 - pass +def get_multi_gpu_communicator(use_cuda_mpi=True): if have_cuda_mpi and use_cuda_mpi: try: comm = MultiGpuCommunicatorCudaMpi() diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py index fdc34a528..be96aed54 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py @@ -85,7 +85,3 @@ def test_multigpu_mpi(self): @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available") def test_multigpu_cudampi(self): self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi()) - - @unittest.skipIf(not mgpu.have_nccl, "NCCL not available") - def test_multigpu_nccl(self): - self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())