Skip to content

Commit

Permalink
Add basic support for GPUs that can be partitioned
Browse files Browse the repository at this point in the history
  • Loading branch information
y2kenny-amd committed Sep 10, 2024
1 parent ab22d52 commit 16b7f7a
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 4 deletions.
70 changes: 67 additions & 3 deletions internal/pkg/amdgpu/amdgpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,21 +95,52 @@ func GetAMDGPUs() map[string]map[string]int {
matches, _ := filepath.Glob("/sys/module/amdgpu/drivers/pci:amdgpu/[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:*")

devices := make(map[string]map[string]int)
card, renderD := 0, 128

for _, path := range matches {
glog.Info(path)
devPaths, _ := filepath.Glob(path + "/drm/*")
devices[filepath.Base(path)] = make(map[string]int)

for _, devPath := range devPaths {
switch name := filepath.Base(devPath); {
case name[0:4] == "card":
devices[filepath.Base(path)][name[0:4]], _ = strconv.Atoi(name[4:])
card, _ = strconv.Atoi(name[4:])
case name[0:7] == "renderD":
devices[filepath.Base(path)][name[0:7]], _ = strconv.Atoi(name[7:])
renderD, _ = strconv.Atoi(name[7:])
}
}

devices[filepath.Base(path)] = map[string]int{"card": card, "renderD": renderD}
}

// certain products have additional devices (such as MI300's partitions)
//ex: /sys/devices/platform/amdgpu_xcp_30
platformMatches, _ := filepath.Glob("/sys/devices/platform/amdgpu_xcp_*")

// This is needed because some of the visible renderD are actually not valid
// Their validity depends on topology information from KFD
topoRenderNodes := renderNodeSetFromTopology()

for _, path := range platformMatches {
glog.Info(path)
devPaths, _ := filepath.Glob(path + "/drm/*")

for _, devPath := range devPaths {
switch name := filepath.Base(devPath); {
case name[0:4] == "card":
card, _ = strconv.Atoi(name[4:])
case name[0:7] == "renderD":
renderD, _ = strconv.Atoi(name[7:])
}
}

if !topoRenderNodes[renderD] {
continue
}

devices[filepath.Base(path)] = map[string]int{"card": card, "renderD": renderD}
}

return devices
}

Expand Down Expand Up @@ -274,3 +305,36 @@ func parseDebugFSFirmwareInfo(path string) (map[string]uint32, map[string]uint32

return feat, fw
}

// renderNodeSetFromTopology returns the set of DRM render minor numbers that
// appear in the KFD topology node properties files.
//
// The optional topoRootParam overrides the default topology root
// ("/sys/class/kfd/kfd"); it is honored only when exactly one value is
// passed. Every file matching <root>/topology/nodes/*/properties is handed to
// ParseTopologyProperties together with a regexp that captures the
// drm_render_minor value. Files that fail to parse, or that yield a value
// that is not positive, contribute nothing to the set.
//
// The returned map is never nil. If the glob itself fails, the error is
// logged and an empty set is returned.
func renderNodeSetFromTopology(topoRootParam ...string) map[int]bool {
	topoRoot := "/sys/class/kfd/kfd"
	if len(topoRootParam) == 1 {
		topoRoot = topoRootParam[0]
	}

	renderNodes := make(map[int]bool)

	nodeFiles, err := filepath.Glob(topoRoot + "/topology/nodes/*/properties")
	if err != nil {
		// Log and degrade to an empty set rather than calling glog.Fatalf:
		// a fatal log would terminate the whole process from inside a helper
		// and would make the return below unreachable.
		glog.Errorf("glob error: %s", err)
		return renderNodes
	}

	topoDrmRenderMinorRe := regexp.MustCompile(`drm_render_minor\s(\d+)`)
	for _, nodeFile := range nodeFiles {
		glog.Info("Parsing " + nodeFile)

		v, e := ParseTopologyProperties(nodeFile, topoDrmRenderMinorRe)
		if e != nil || v <= 0 {
			// Skip nodes whose properties cannot be parsed or that report a
			// non-positive render minor.
			// NOTE(review): presumably such nodes have no usable render
			// device (e.g. CPU-only nodes) -- confirm against KFD docs.
			continue
		}

		renderNodes[int(v)] = true
	}

	return renderNodes
}
16 changes: 16 additions & 0 deletions internal/pkg/amdgpu/amdgpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
package amdgpu

import (
"encoding/json"
"fmt"
"io/ioutil"
"path/filepath"
"reflect"
"regexp"
"strings"
"testing"
Expand Down Expand Up @@ -213,3 +215,17 @@ func TestParseDebugFSFirmwareInfo(t *testing.T) {
t.Errorf("Incorrect parsing of amdgpu firmware info from debugfs")
}
}

// TestRenderNodeSetFromTopology runs renderNodeSetFromTopology against the
// checked-in topology fixture and expects exactly render minors 128 and 129
// to be reported. On mismatch, both sets are printed as indented JSON.
func TestRenderNodeSetFromTopology(t *testing.T) {
	want := map[int]bool{128: true, 129: true}
	got := renderNodeSetFromTopology("../../../testdata/topology-parsing")

	if reflect.DeepEqual(got, want) {
		return
	}

	// Marshal errors are ignored: the output is only diagnostic.
	gotJSON, _ := json.MarshalIndent(got, "", " ")
	wantJSON, _ := json.MarshalIndent(want, "", " ")

	t.Errorf("RenderNode set was incorrect")
	t.Errorf("Got: %s", gotJSON)
	t.Errorf("Want: %s", wantJSON)
}
2 changes: 1 addition & 1 deletion testdata/topology-parsing/topology/nodes/2/properties
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ max_slots_scratch_cu 32
vendor_id 4098
device_id 26720
location_id 6400
drm_render_minor 128
drm_render_minor 129
max_engine_clk_fcompute 1500
local_mem_size 17163091968
fw_version 392
Expand Down

0 comments on commit 16b7f7a

Please sign in to comment.