From a9d8b906aa68c3926b269e143679003c2422a2e1 Mon Sep 17 00:00:00 2001 From: Gyuho Lee <6799218+gyuho@users.noreply.github.com> Date: Sun, 27 Oct 2024 13:47:28 +0800 Subject: [PATCH] fix(nvidia): persistence mode check based on NVML, do not rely on "nvidia-persistenced" binary (#137) Even if "nvidia-persistenced" is not running, a GPU can still have persistence mode enabled via NVML. We should therefore not rely on the nvidia-persistenced daemon running to decide whether persistence mode is enabled. Signed-off-by: Gyuho Lee --- .../persistence-mode/component_output.go | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/components/accelerator/nvidia/persistence-mode/component_output.go b/components/accelerator/nvidia/persistence-mode/component_output.go index be83d149..744e3e33 100644 --- a/components/accelerator/nvidia/persistence-mode/component_output.go +++ b/components/accelerator/nvidia/persistence-mode/component_output.go @@ -91,16 +91,8 @@ func ParseStatesToOutput(states ...components.State) (*Output, error) { // Returns the output evaluation reason and its healthy-ness. func (o *Output) Evaluate() (string, bool, error) { reasons := []string{} - healthy := true - if !o.PersistencedExists { - reasons = append(reasons, "nvidia-persistenced does not exist (install 'nvidia-persistenced' or run 'nvidia-smi -pm 1')") - healthy = false - } - if !o.PersistencedRunning { - reasons = append(reasons, "nvidia-persistenced exists but not running (start 'nvidia-persistenced' or run 'nvidia-smi -pm 1')") - healthy = false - } + enabled := true for _, p := range o.PersistenceModesSMI { if o.PersistencedRunning { continue @@ -111,7 +103,7 @@ func (o *Output) Evaluate() (string, bool, error) { // we cannot guarantee that the NVIDIA Persistence Daemon will be running. This would be a feature regression as persistence mode might not be available out-of- the-box." 
if !p.Enabled { reasons = append(reasons, fmt.Sprintf("persistence mode is not enabled on %s (nvidia-smi)", p.ID)) - healthy = false + enabled = false } } @@ -125,11 +117,20 @@ func (o *Output) Evaluate() (string, bool, error) { // we cannot guarantee that the NVIDIA Persistence Daemon will be running. This would be a feature regression as persistence mode might not be available out-of- the-box." if !p.Enabled { reasons = append(reasons, fmt.Sprintf("persistence mode is not enabled on %s (NVML)", p.UUID)) - healthy = false + enabled = false } } - return strings.Join(reasons, "; "), healthy, nil + // does not make the component unhealthy, since persistence mode can still be enabled + // recommend installing nvidia-persistenced since it's the recommended way to enable persistence mode + if !o.PersistencedExists { + reasons = append(reasons, "nvidia-persistenced does not exist (install 'nvidia-persistenced' or run 'nvidia-smi -pm 1')") + } + if !o.PersistencedRunning { + reasons = append(reasons, "nvidia-persistenced exists but not running (start 'nvidia-persistenced' or run 'nvidia-smi -pm 1')") + } + + return strings.Join(reasons, "; "), enabled, nil } func (o *Output) States() ([]components.State, error) {