From 3f3b962dd53e437579e19ffda2d9dcc89c1b5d39 Mon Sep 17 00:00:00 2001 From: David Trudgian Date: Mon, 20 Dec 2021 16:58:10 -0600 Subject: [PATCH 1/2] fix: gpu: ensure MIGs available with --nvccli and no --contain When using nvidia-container-cli to set up the GPUs in the container, we need to force NVIDIA_VISIBLE_DEVICES=all to have an exact match to the legacy GPU binding behaviour, where MIGs are available as well as physical GPUs. Fixes #471 --- CHANGELOG.md | 7 +++++++ cmd/internal/cli/actions_linux.go | 17 ++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c684c8dcb8..5218e32413 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # SingularityCE Changelog +## Changes Since Last Release + +### Bug fixes + +- Ensure MIGs are visible with `--nvccli` in non-contained mode, to match the + legacy GPU binding behaviour. + ## v3.9.2 \[2021-12-10\] ### Bug fixes diff --git a/cmd/internal/cli/actions_linux.go b/cmd/internal/cli/actions_linux.go index 03cedf9e3a..d9ff5177d8 100644 --- a/cmd/internal/cli/actions_linux.go +++ b/cmd/internal/cli/actions_linux.go @@ -800,11 +800,18 @@ func setNvCCLIConfig(engineConfig *singularityConfig.EngineConfig) (err error) { sylog.Debugf("Using nvidia-container-cli for GPU setup") engineConfig.SetNvCCLI(true) - // When we use --contain we don't mount the NV devices by default in the nvidia-container-cli flow, - // they must be mounted via specifying with`NVIDIA_VISIBLE_DEVICES`. This differs from the legacy - // flow which mounts all GPU devices, always. 
- if (IsContained || IsContainAll) && os.Getenv("NVIDIA_VISIBLE_DEVICES") == "" { - sylog.Warningf("When using nvidia-container-cli with --contain NVIDIA_VISIBLE_DEVICES must be set or no GPUs will be available in container.") + if os.Getenv("NVIDIA_VISIBLE_DEVICES") == "" { + if IsContained || IsContainAll { + // When we use --contain we don't mount the NV devices by default in the nvidia-container-cli flow, + // they must be mounted by specifying `NVIDIA_VISIBLE_DEVICES`. This differs from the legacy + // flow which mounts all GPU devices, always... so warn the user. + sylog.Warningf("When using nvidia-container-cli with --contain NVIDIA_VISIBLE_DEVICES must be set or no GPUs will be available in container.") + } else { + // In non-contained mode set NVIDIA_VISIBLE_DEVICES="all" by default, so MIGs are available. + // Otherwise there is a difference vs legacy GPU binding. See Issue #471. + sylog.Infof("Setting 'NVIDIA_VISIBLE_DEVICES=all' to emulate legacy GPU binding.") + os.Setenv("NVIDIA_VISIBLE_DEVICES", "all") + } } // Pass NVIDIA_ env vars that will be converted to nvidia-container-cli options From e0ce9e8be0ef6d3181e5aea487ae55b3ce682fb5 Mon Sep 17 00:00:00 2001 From: David Trudgian Date: Wed, 29 Dec 2021 08:44:06 -0600 Subject: [PATCH 2/2] Add missing changelog entry for #484 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5218e32413..ca8ca48340 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Ensure MIGs are visible with `--nvccli` in non-contained mode, to match the legacy GPU binding behaviour. +- Avoid fd leak in loop device transient error path. ## v3.9.2 \[2021-12-10\]