Skip to content

Commit

Permalink
feat patch unhealthy gpu list to node annotations
Browse files Browse the repository at this point in the history
Signed-off-by: yongjiahe <[email protected]>
  • Loading branch information
yongjiahe committed May 13, 2022
1 parent b8725b7 commit 8be99d4
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 1 deletion.
3 changes: 3 additions & 0 deletions pkg/plugin/nvidia/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ const (
GPUAssigned = "volcano.sh/gpu-assigned"
GPUIndex = "volcano.sh/gpu-index"

// UnhealthyGPUIDs list of unhealthy gpu ids
UnhealthyGPUIDs = "volcano.sh/gpu-unhealthy-ids"

// Container env
// Allocated gpu memory
AllocatedGPUResource = "VOLCANO_GPU_ALLOCATED"
Expand Down
41 changes: 41 additions & 0 deletions pkg/plugin/nvidia/kube_interactor.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"os"
"strings"
"time"

v1 "k8s.io/api/core/v1"
Expand All @@ -31,6 +32,7 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
nodeutil "k8s.io/kubernetes/pkg/util/node"
)

Expand Down Expand Up @@ -114,3 +116,42 @@ func (ki *KubeInteractor) PatchGPUResourceOnNode(gpuCount int) error {
})
return err
}

func (ki *KubeInteractor) PatchUnhealthyGPUListOnNode(devices []*Device) error {
var err error
unhealthyGPUsStr := ""
unhealthyGPUs := []string{}

for i := range devices {
if devices[i].Health == pluginapi.Unhealthy {
unhealthyGPUs = append(unhealthyGPUs, fmt.Sprintf("%d", devices[i].Index))
}
}

if len(unhealthyGPUs) > 0 {
unhealthyGPUsStr = strings.Join(unhealthyGPUs, ",")
}

err = wait.PollImmediate(1*time.Second, 10*time.Second, func() (bool, error) {
var node *v1.Node
node, err = ki.clientset.CoreV1().Nodes().Get(context.TODO(), ki.nodeName, metav1.GetOptions{})
if err != nil {
klog.Info("failed to get node %s: %v", ki.nodeName, err)
return false, nil
}

newNode := node.DeepCopy()
if unhealthyGPUsStr != "" {
newNode.Annotations[UnhealthyGPUIDs] = unhealthyGPUsStr
} else {
delete(newNode.Annotations, UnhealthyGPUIDs)
}
_, _, err = nodeutil.PatchNodeStatus(ki.clientset.CoreV1(), types.NodeName(ki.nodeName), node, newNode)
if err != nil {
klog.Infof("failed to patch volcano unhealthy gpu list %s: %v", unhealthyGPUsStr, err)
return false, nil
}
return true, nil
})
return err
}
25 changes: 24 additions & 1 deletion pkg/plugin/nvidia/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@
package nvidia

import (
"fmt"
"log"
"os"
"strings"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"

"k8s.io/klog"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

Expand All @@ -33,7 +35,8 @@ const (

type Device struct {
pluginapi.Device
Path string
Path string
Index uint
}

type ResourceManager interface {
Expand Down Expand Up @@ -76,6 +79,10 @@ func buildDevice(d *nvml.Device) *Device {
dev.ID = d.UUID
dev.Health = pluginapi.Healthy
dev.Path = d.Path

_, err := fmt.Sscanf(d.Path, "/dev/nvidia%d", &dev.Index)
check(err)

if d.CPUAffinity != nil {
dev.Topology = &pluginapi.TopologyInfo{
Nodes: []*pluginapi.NUMANode{
Expand Down Expand Up @@ -110,6 +117,12 @@ func checkHealth(stop <-chan struct{}, devices []*Device, unhealthy chan<- *Devi
check(err)
}

firstTime := true
ki, err := NewKubeInteractor()
if err != nil {
klog.Fatalf("cannot create kube interactor. %v", err)
}

for {
select {
case <-stop:
Expand All @@ -119,13 +132,23 @@ func checkHealth(stop <-chan struct{}, devices []*Device, unhealthy chan<- *Devi

e, err := nvml.WaitForEvent(eventSet, 5000)
if err != nil && e.Etype != nvml.XidCriticalError {
if firstTime {
// reset unhealthy gpu list if all devices healthy
ki.PatchUnhealthyGPUListOnNode(devices)
firstTime = false
}
continue
}

// FIXME: formalize the full list and document it.
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
// Application errors: the GPU should still be healthy
if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 {
if firstTime {
// reset unhealthy gpu list if all devices healthy
ki.PatchUnhealthyGPUListOnNode(devices)
firstTime = false
}
continue
}

Expand Down
7 changes: 7 additions & 0 deletions pkg/plugin/nvidia/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,16 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device
return nil
case d := <-m.health:
// FIXME: there is no way to recover from the Unhealthy state.
isChange := false
if d.Health != pluginapi.Unhealthy {
isChange = true
}
d.Health = pluginapi.Unhealthy
log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID)
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.virtualDevices})
if isChange {
m.kubeInteractor.PatchUnhealthyGPUListOnNode(m.physicalDevices)
}
}
}
}
Expand Down

0 comments on commit 8be99d4

Please sign in to comment.