From b0d5e8ab018230faaf6ee23915431a5c4c00bb85 Mon Sep 17 00:00:00 2001 From: silenceper Date: Thu, 27 Jul 2023 18:18:27 +0800 Subject: [PATCH 1/2] support numa --- pkg/device_plugin/device_plugin.go | 31 ++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/pkg/device_plugin/device_plugin.go b/pkg/device_plugin/device_plugin.go index 219eb8c5..2d05317d 100644 --- a/pkg/device_plugin/device_plugin.go +++ b/pkg/device_plugin/device_plugin.go @@ -35,6 +35,7 @@ import ( "os" "path/filepath" "regexp" + "strconv" "strings" klog "k8s.io/klog/v2" @@ -62,6 +63,9 @@ var vGpuMap map[string][]NvidiaGpuDevice // Key is the Nvidia GPU id and value is the list of associated vGPU ids var gpuVgpuMap map[string][]string +// deviceNumaMap is a map of device id to NUMA node id +var deviceNumaMap map[string]int + var basePath = "/sys/bus/pci/devices" var vGpuBasePath = "/sys/bus/mdev/devices" var pciIdsFilePath = "/usr/pci.ids" @@ -91,15 +95,25 @@ func createDevicePlugins() { log.Printf("Device Map %s", deviceMap) log.Println("vGPU Map ", vGpuMap) log.Println("GPU vGPU Map ", gpuVgpuMap) + log.Println("Device NUMA Map ", deviceNumaMap) //Iterate over deivceMap to create device plugin for each type of GPU on the host for k, v := range deviceMap { devs = nil for _, dev := range v { - devs = append(devs, &pluginapi.Device{ + device := &pluginapi.Device{ ID: dev, Health: pluginapi.Healthy, - }) + } + numa, found := deviceNumaMap[dev] + if found { + device.Topology = &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{{ID: int64(numa)}}, + } + } else { + log.Printf("Error: Could not find NUMA node for device id: %s", dev) + } + devs = append(devs, device) } deviceName := getDeviceName(k) if deviceName == "" { @@ -162,6 +176,7 @@ func startVgpuDevicePluginFunc(dp *GenericVGpuDevicePlugin) error { func createIommuDeviceMap() { iommuMap = make(map[string][]NvidiaGpuDevice) deviceMap = make(map[string][]string) + deviceNumaMap = make(map[string]int) //Walk directory to discover pci devices filepath.Walk(basePath, func(path string, info os.FileInfo, err error) error { if err != nil { @@ -206,6 +221,18 @@ func createIommuDeviceMap() { deviceMap[deviceID] = append(deviceMap[deviceID], iommuGroup) } iommuMap[iommuGroup] = append(iommuMap[iommuGroup], NvidiaGpuDevice{info.Name()}) + numaContent, err := ioutil.ReadFile(fmt.Sprintf("%s/%s/numa_node", basePath, info.Name())) + if err != nil { + log.Printf("Error reading NUMA node for device %s, err %+v", info.Name(), err) + return nil + } + numaInt, err := strconv.Atoi(strings.Trim(string(numaContent), " \n")) + if err != nil { + log.Printf("Error converting NUMA node for device %s, err %+v", info.Name(), err) + return nil + } + log.Printf("NUMA node for device %s is %d", info.Name(), numaInt) + deviceNumaMap[iommuGroup] = numaInt } } return nil From fcc6aef6ae11dd363af94d50a5cc55b40a3edef0 Mon Sep 17 00:00:00 2001 From: Timur Naurazbaev Date: Mon, 11 Dec 2023 18:30:14 +0100 Subject: [PATCH 2/2] tests added --- pkg/device_plugin/device_plugin.go | 28 +++++++---- pkg/device_plugin/device_plugin_test.go | 46 +++++++++++++++++++ .../generic_device_plugin_test.go | 1 + 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/pkg/device_plugin/device_plugin.go b/pkg/device_plugin/device_plugin.go index 2d05317d..ee0ec878 100644 --- a/pkg/device_plugin/device_plugin.go +++ b/pkg/device_plugin/device_plugin.go @@ -71,6 +71,7 @@ var vGpuBasePath = "/sys/bus/mdev/devices" var pciIdsFilePath = "/usr/pci.ids" var readLink = readLinkFunc var readIDFromFile = readIDFromFileFunc +var readNUMAnodeIDFromFile = readNUMAnodeIDFromFileFunc var startDevicePlugin = startDevicePluginFunc var readVgpuIDFromFile = readVgpuIDFromFileFunc var readGpuIDForVgpu = readGpuIDForVgpuFunc @@ -221,18 +222,15 @@ func createIommuDeviceMap() { deviceMap[deviceID] = append(deviceMap[deviceID], iommuGroup) } iommuMap[iommuGroup] = append(iommuMap[iommuGroup], NvidiaGpuDevice{info.Name()}) - numaContent, err := ioutil.ReadFile(fmt.Sprintf("%s/%s/numa_node", basePath, info.Name())) + numaID, err := readNUMAnodeIDFromFile(basePath, info.Name()) if err != nil { - log.Printf("Error reading NUMA node for device %s, err %+v", info.Name(), err) + log.Println("Could not get numa node id for device ", info.Name()) return nil } - numaInt, err := strconv.Atoi(strings.Trim(string(numaContent), " \n")) - if err != nil { - log.Printf("Error converting NUMA node for device %s, err %+v", info.Name(), err) - return nil + if numaID != nil { + log.Printf("NUMA node for device %s is %d", info.Name(), *numaID) + deviceNumaMap[iommuGroup] = *numaID } - log.Printf("NUMA node for device %s is %d", info.Name(), numaInt) - deviceNumaMap[iommuGroup] = numaInt } } return nil @@ -284,6 +282,20 @@ func readIDFromFileFunc(basePath string, deviceAddress string, property string) return id, nil } +func readNUMAnodeIDFromFileFunc(basePath string, deviceAddress string) (*int, error) { + numaContent, err := ioutil.ReadFile(filepath.Join(basePath, deviceAddress, "numa_node")) + if err != nil { + glog.Errorf("Could not read NUMA node id for device %s, err %s", deviceAddress, err) + return nil, err + } + numaID, err := strconv.Atoi(strings.Trim(string(numaContent), " \n")) + if err != nil { + glog.Errorf("Could not convert to int NUMA node id for device %s, err %s", deviceAddress, err) + return nil, err + } + return &numaID, nil +} + // Read a file link func readLinkFunc(basePath string, deviceAddress string, link string) (string, error) { path, err := os.Readlink(filepath.Join(basePath, deviceAddress, link)) diff --git a/pkg/device_plugin/device_plugin_test.go b/pkg/device_plugin/device_plugin_test.go index a55d40f4..93279bb5 100644 --- a/pkg/device_plugin/device_plugin_test.go +++ b/pkg/device_plugin/device_plugin_test.go @@ -97,6 +97,17 @@ func getFakeIDFromFileDevicePlugin(basePath string, deviceAddress string, link s return "", errors.New("Incorrect operation") } +func getFakeNUMAnodeIDFromDevicePlugin(basePath string, deviceAddress string) (*int, error) { + if deviceAddress == deviceAddress1 { + fakeNUMANodeID := 0 + return &fakeNUMANodeID, nil + } else if deviceAddress == deviceAddress2 { + fakeNUMANodeID := 1 + return &fakeNUMANodeID, nil + } + return nil, errors.New("Incorrect operation") +} + func fakeStartDevicePluginFunc(dp *GenericDevicePlugin) error { if dp.deviceName == deviceName { return errors.New("Incorrect operation") @@ -186,6 +197,40 @@ var _ = Describe("Device Plugin", func() { }) }) + Context("readNUMAnodeIDFromFileFunc() Tests", func() { + BeforeEach(func() { + workDir, err = ioutil.TempDir("", "kubevirt-test") + Expect(err).ToNot(HaveOccurred()) + os.Mkdir(workDir+"/1", 0755) + ioutil.WriteFile(filepath.Join(workDir, deviceAddress1, "numa_node"), []byte("0\n"), 0644) + }) + + It("Read numa node id with out error", func() { + nodeID, err := readNUMAnodeIDFromFileFunc(workDir, deviceAddress1) + Expect(err).To(BeNil()) + Expect(nodeID).ToNot(BeNil()) + Expect(*nodeID).To(Equal(numaNodeID)) + }) + + It("Read numa node id from a missing location to throw error", func() { + os.Remove(filepath.Join(workDir, deviceAddress1, "numa_node")) + + nodeID, err := readNUMAnodeIDFromFileFunc(workDir, deviceAddress1) + Expect(err).NotTo(BeNil()) + var nilNumaNodeID *int + Expect(nodeID).To(Equal(nilNumaNodeID)) + }) + + It("Incorrect value of numa node id to throw error", func() { + ioutil.WriteFile(filepath.Join(workDir, deviceAddress1, "numa_node"), []byte("incorrect\n"), 0644) + + nodeID, err := readNUMAnodeIDFromFileFunc(workDir, deviceAddress1) + Expect(err).NotTo(BeNil()) + var nilNumaNodeID *int + Expect(nodeID).To(Equal(nilNumaNodeID)) + }) + }) + Context("readVgpuIDFromFile() Tests", func() { BeforeEach(func() { readVgpuIDFromFile = readVgpuIDFromFileFunc @@ -272,6 +317,7 @@ var _ = Describe("Device Plugin", func() { It("", func() { readLink = getFakeLinkDevicePlugin readIDFromFile = getFakeIDFromFileDevicePlugin + readNUMAnodeIDFromFile = getFakeNUMAnodeIDFromDevicePlugin startDevicePlugin = fakeStartDevicePluginFunc createIommuDeviceMap() diff --git a/pkg/device_plugin/generic_device_plugin_test.go b/pkg/device_plugin/generic_device_plugin_test.go index cbafde70..f4597479 100644 --- a/pkg/device_plugin/generic_device_plugin_test.go +++ b/pkg/device_plugin/generic_device_plugin_test.go @@ -50,6 +50,7 @@ var pciAddress1 = "11" var pciAddress2 = "22" var pciAddress3 = "33" var nvVendorID = "10de" +var numaNodeID = 0 type fakeDevicePluginListAndWatchServer struct { grpc.ServerStream