diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index b1e2e491..713d251c 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -20,7 +20,7 @@ dependencies:
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
-- pynvml>=11.0.0,<12.0.0a0
+- pynvml>=12.0.0,<13.0.0a0
 - pytest
 - pytest-cov
 - python>=3.10,<3.13
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 3144dee4..518f86dd 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -20,7 +20,7 @@ dependencies:
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
-- pynvml>=11.0.0,<12.0.0a0
+- pynvml>=12.0.0,<13.0.0a0
 - pytest
 - pytest-cov
 - python>=3.10,<3.13
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 95bb356c..5cdcab0a 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -21,7 +21,7 @@ dependencies:
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
-- pynvml>=11.0.0,<12.0.0a0
+- pynvml>=12.0.0,<13.0.0a0
 - pytest
 - pytest-cov
 - python>=3.10,<3.13
diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py
index cd1928af..ef60e52e 100644
--- a/dask_cuda/plugins.py
+++ b/dask_cuda/plugins.py
@@ -1,4 +1,5 @@
 import importlib
+import logging
 import os
 from typing import Callable, Dict
 
@@ -12,7 +13,15 @@ def __init__(self, cores):
         self.cores = cores
 
     def setup(self, worker=None):
-        os.sched_setaffinity(0, self.cores)
+        try:
+            os.sched_setaffinity(0, self.cores)
+        except Exception:
+            logger = logging.getLogger("distributed.worker")
+            logger.warning(
+                "Setting CPU affinity for GPU failed. Please refer to the following "
+                "link for troubleshooting information: "
+                "https://docs.rapids.ai/api/dask-cuda/nightly/troubleshooting/#setting-cpu-affinity-failure"  # noqa: E501
+            )
 
 
 class CUDFSetup(WorkerPlugin):
diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py
index a0a77677..a5e78db2 100644
--- a/dask_cuda/tests/test_utils.py
+++ b/dask_cuda/tests/test_utils.py
@@ -1,6 +1,7 @@
 import os
 from unittest.mock import patch
 
+import pynvml
 import pytest
 from numba import cuda
 
@@ -197,7 +198,6 @@ def test_get_ucx_config(enable_tcp_over_ucx, enable_infiniband, enable_nvlink):
 
 
 def test_parse_visible_devices():
-    pynvml = pytest.importorskip("pynvml")
     pynvml.nvmlInit()
     indices = []
     uuids = []
@@ -250,7 +250,6 @@ def test_parse_device_memory_limit():
 
 
 def test_parse_visible_mig_devices():
-    pynvml = pytest.importorskip("pynvml")
     pynvml.nvmlInit()
     for index in range(get_gpu_count()):
         handle = pynvml.nvmlDeviceGetHandleByIndex(index)
diff --git a/dependencies.yaml b/dependencies.yaml
index 08f5ce2f..7b4d9434 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -157,7 +157,7 @@ dependencies:
           - numba>=0.57
           - numpy>=1.23,<3.0a0
           - pandas>=1.3
-          - pynvml>=11.0.0,<12.0.0a0
+          - pynvml>=12.0.0,<13.0.0a0
           - rapids-dask-dependency==25.2.*,>=0.0.0a0
           - zict>=2.0.0
   test_python:
diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
index 3af5e08d..d83c5577 100644
--- a/docs/source/troubleshooting.rst
+++ b/docs/source/troubleshooting.rst
@@ -30,3 +30,60 @@ For the DGX Station A100, the display GPU is commonly the fourth in the PCI Bus
 
     >>> from dask_cuda import LocalCUDACluster
     >>> cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES=[0, 1, 2, 4])
+
+Setting CPU Affinity Failure
+----------------------------
+
+Setting the proper CPU affinity for a Dask-CUDA worker is important to ensure optimal performance, particularly when
+memory transfers to/from system memory are necessary. Dask-CUDA does this automatically, attempting to determine the
+appropriate CPU affinity for each worker according to the GPU that worker targets.
+
+There are situations where setting the CPU affinity may fail. The most common case involves workload managers and job
+schedulers used by large compute clusters, such as Slurm.
+
+In a node with multiple physical CPUs (i.e., multiple CPU sockets) and multiple GPUs, it is common for each GPU to be
+directly connected to a specific physical CPU to balance resources. Consider, for example, a node with 4 GPUs and 40
+CPU cores, where the cores are split between two physical CPUs: GPUs 0 and 1 may be connected to CPUs 0-19, and GPUs 2
+and 3 to CPUs 20-39. In a setup like this, if the node is entirely assigned to the Dask-CUDA job, setting CPU affinity
+will most likely succeed. However, it is still possible that the job is assigned the wrong CPUs, for example CPUs
+20-39 for GPUs 0 and 1, or CPUs 0-19 for GPUs 2 and 3. In that case setting the CPU affinity is impossible, since the
+correct CPU/GPU resources are not available to the job. When this happens, the best Dask-CUDA can do is raise a
+warning that redirects you to this section and not set any CPU affinity, letting the operating system handle all
+transfers as it sees fit, even if they may follow a suboptimal path.
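+
+The snippet below is a minimal, illustrative sketch, not the exact code Dask-CUDA runs internally, and the file name
+``check_affinity.py`` and helper function are made up for this example. It shows how you can verify whether this
+failure applies to your allocation: it asks NVML for the CPUs considered local to each GPU and compares them against
+the CPUs the current job is actually allowed to use, as reported by ``os.sched_getaffinity``. An empty intersection
+for the GPU a worker targets means the correct affinity cannot be set.
+
+.. code-block:: python
+
+    # check_affinity.py -- illustrative sketch only, not part of Dask-CUDA
+    import math
+    import os
+    from multiprocessing import cpu_count
+
+    import pynvml
+
+
+    def nvml_cpu_affinity(handle):
+        # NVML packs the CPU affinity into 64-bit words; unpack it into a set of core indices.
+        words = pynvml.nvmlDeviceGetCpuAffinity(handle, math.ceil(cpu_count() / 64))
+        return {
+            64 * w + bit for w, word in enumerate(words) for bit in range(64) if word >> bit & 1
+        }
+
+
+    pynvml.nvmlInit()
+    allowed = os.sched_getaffinity(0)  # CPUs actually granted to this job/process
+    for i in range(pynvml.nvmlDeviceGetCount()):
+        local = nvml_cpu_affinity(pynvml.nvmlDeviceGetHandleByIndex(i))
+        overlap = sorted(local & allowed)
+        if overlap:
+            print(f"GPU {i}: affinity can be set to {overlap}")
+        else:
+            print(f"GPU {i}: no overlap between GPU-local CPUs and the job's CPUs")
+
+If the output shows no overlap for one or more GPUs, the resolution usually lies with the job scheduler configuration,
+as discussed for Slurm below.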
+
+If the problem persists after following the instructions in this section, including consulting your cluster's manual
+and administrators, please `file an issue in the Dask-CUDA repository <https://github.com/rapidsai/dask-cuda/issues>`_,
+including the output of all commands below, executed from within the allocated cluster job:
+
+- ``conda list``, if the environment was installed with conda or uses a RAPIDS-provided Docker image;
+- ``pip list``, if the environment was installed with pip;
+- ``nvidia-smi``;
+- ``nvidia-smi topo -m``;
+- ``python print_affinity.py``, the code for ``print_affinity.py`` immediately follows.
+
+.. code-block:: python
+
+    # print_affinity.py
+    import math
+    from multiprocessing import cpu_count
+
+    import pynvml
+
+    pynvml.nvmlInit()
+    for i in range(pynvml.nvmlDeviceGetCount()):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+        # NVML reports the CPU affinity as a bitmask packed into 64-bit words.
+        cpu_affinity = pynvml.nvmlDeviceGetCpuAffinity(handle, math.ceil(cpu_count() / 64))
+        print(f"GPU {i}: {list(cpu_affinity)}")
+
+Slurm
+~~~~~
+
+The most commonly observed cases of this issue have been reported on Slurm clusters. Resolving it normally involves
+granting the job an appropriate set of CPUs with one of the following arguments:
+
+- ``--cpus-per-task=N``: the number of CPUs allocated to each task in the job; you may need to request all of the
+  node's CPUs to ensure that the GPUs have all of their associated CPUs available;
+- ``--exclusive``: ensures exclusive allocation of the node's CPUs to the job.
+
+Unfortunately, providing exact solutions for all existing cluster configurations is not possible, so make sure to
+consult your cluster's manual and administrators for detailed information and further troubleshooting.
diff --git a/pyproject.toml b/pyproject.toml
index cfe5397c..105f6a9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ dependencies = [
     "numba>=0.57",
     "numpy>=1.23,<3.0a0",
     "pandas>=1.3",
-    "pynvml>=11.0.0,<12.0.0a0",
+    "pynvml>=12.0.0,<13.0.0a0",
     "rapids-dask-dependency==25.2.*,>=0.0.0a0",
     "zict>=2.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.