Skip to content

Commit

Permalink
feat(nvidia/gsp-firmware-mode): initial commit to track GSP modes, fi…
Browse files Browse the repository at this point in the history
…x no data handling (#162)

* feat(nvidia/gsp-firmware-mode): initial commit to track GSP modes

Signed-off-by: Gyuho Lee <[email protected]>

* track supported field

Signed-off-by: Gyuho Lee <[email protected]>

* track supported in reason

Signed-off-by: Gyuho Lee <[email protected]>

* fix Healthy false for no data

Signed-off-by: Gyuho Lee <[email protected]>

---------

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Nov 7, 2024
1 parent e7b13b3 commit c9fba83
Show file tree
Hide file tree
Showing 13 changed files with 302 additions and 3 deletions.
2 changes: 1 addition & 1 deletion components/accelerator/nvidia/bad-envs/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Name: bad_envs_id.Name,
Healthy: false,
Healthy: true,
Error: query.ErrNoData.Error(),
Reason: query.ErrNoData.Error(),
},
Expand Down
99 changes: 99 additions & 0 deletions components/accelerator/nvidia/gsp-firmware-mode/component.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Package gspfirmwaremode tracks the NVIDIA GSP firmware mode.
package gspfirmwaremode

import (
"context"
"fmt"
"time"

"github.com/leptonai/gpud/components"
nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
)

// New builds the GSP firmware mode component.
//
// It fills in unset query defaults on cfg, derives a cancellable context from
// ctx, and starts nvidia_query.DefaultPoller with that context under this
// component's name. The returned component keeps the original ctx, the cancel
// function, and a reference to the same poller.
func New(ctx context.Context, cfg Config) components.Component {
	cfg.Query.SetDefaultsIfNotSet()

	pollerCtx, cancel := context.WithCancel(ctx)
	c := &component{
		rootCtx: ctx,
		cancel:  cancel,
		poller:  nvidia_query.DefaultPoller,
	}

	nvidia_query.DefaultPoller.Start(pollerCtx, cfg.Query, nvidia_gsp_firmware_mode_id.Name)
	return c
}

// Compile-time check that *component implements components.Component.
var _ components.Component = (*component)(nil)

// component is the GSP firmware mode component returned by New.
type component struct {
// rootCtx is the context passed to New.
rootCtx context.Context
// cancel cancels the context that New handed to the poller's Start call.
cancel context.CancelFunc
// poller provides the last query result (Last) and is stopped on Close.
poller query.Poller
}

// Name returns the component ID defined in the id package.
func (c *component) Name() string { return nvidia_gsp_firmware_mode_id.Name }

// States reports the component state derived from the poller's last result.
//
// The outcomes are, in order:
//   - no data collected yet (query.ErrNoData): a single healthy state whose
//     Error and Reason carry the ErrNoData message;
//   - any other error from the poller: returned as-is;
//   - the last query itself failed: a single unhealthy state with the query error;
//   - the last query produced no output: a single unhealthy state;
//   - the output is not a *nvidia_query.Output: an error;
//   - otherwise: the states computed by Output.States.
//
// Every state built here carries the component name so that consumers can
// attribute it, matching the no-data branch.
func (c *component) States(ctx context.Context) ([]components.State, error) {
	last, err := c.poller.Last()
	if err == query.ErrNoData { // no data
		log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", nvidia_gsp_firmware_mode_id.Name)
		return []components.State{
			{
				Name:    nvidia_gsp_firmware_mode_id.Name,
				Healthy: true,
				Error:   query.ErrNoData.Error(),
				Reason:  query.ErrNoData.Error(),
			},
		}, nil
	}
	if err != nil {
		return nil, err
	}
	if last.Error != nil {
		return []components.State{
			{
				Name:    nvidia_gsp_firmware_mode_id.Name,
				Healthy: false,
				Error:   last.Error.Error(),
				Reason:  "last query failed",
			},
		}, nil
	}
	if last.Output == nil {
		return []components.State{
			{
				Name:    nvidia_gsp_firmware_mode_id.Name,
				Healthy: false,
				Reason:  "no output",
			},
		}, nil
	}

	allOutput, ok := last.Output.(*nvidia_query.Output)
	if !ok {
		return nil, fmt.Errorf("invalid output type: %T", last.Output)
	}

	output := ToOutput(allOutput)
	return output.States()
}

// Events always returns no events and no error; ctx and since are unused.
func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
return nil, nil
}

// Metrics logs the requested "since" time at debug level and returns no
// metrics and no error.
func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) {
log.Logger.Debugw("querying metrics", "since", since)

return nil, nil
}

// Close stops the poller for this component's name and always returns nil.
// The error returned by Stop is intentionally discarded.
//
// NOTE(review): c.cancel is never invoked here — presumably because the
// poller (nvidia_query.DefaultPoller) is shared with other components and
// cancelling its context could affect them; confirm against the poller.
func (c *component) Close() error {
log.Logger.Debugw("closing component")

// safe to call stop multiple times
_ = c.poller.Stop(nvidia_gsp_firmware_mode_id.Name)

return nil
}
100 changes: 100 additions & 0 deletions components/accelerator/nvidia/gsp-firmware-mode/component_output.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package gspfirmwaremode

import (
"encoding/json"
"errors"
"fmt"
"strings"

"github.com/leptonai/gpud/components"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml"
)

// ToOutput converts nvidia_query.Output to Output.
// The result is never nil: when the input or its NVML field is nil, an empty
// Output is returned. Otherwise the GSP firmware mode of every entry in
// i.NVML.DeviceInfos is collected in order.
func ToOutput(i *nvidia_query.Output) *Output {
	out := &Output{}
	if i == nil || i.NVML == nil {
		return out
	}

	for _, dev := range i.NVML.DeviceInfos {
		out.GSPFirmwareModesNVML = append(out.GSPFirmwareModesNVML, dev.GSPFirmwareMode)
	}
	return out
}

// Output is this component's view of the NVIDIA query result.
type Output struct {
// GSPFirmwareModesNVML holds one entry per NVML device info, in the order
// ToOutput encountered them.
GSPFirmwareModesNVML []nvidia_query_nvml.GSPFirmwareMode `json:"gsp_firmware_modes_nvml"`
}

// JSON encodes the output as JSON using encoding/json.
func (o *Output) JSON() ([]byte, error) {
	encoded, err := json.Marshal(o)
	return encoded, err
}

// ParseOutputJSON decodes JSON data into a new Output.
// It returns a nil Output together with the decoding error on failure.
func ParseOutputJSON(data []byte) (*Output, error) {
	var parsed Output
	if err := json.Unmarshal(data, &parsed); err != nil {
		return nil, err
	}
	return &parsed, nil
}

const (
// StateNameGSPFirmwareMode is the Name of the state produced by Output.States
// and the only state name accepted by ParseStatesToOutput.
StateNameGSPFirmwareMode = "gsp_firmware_mode"

// StateKeyGSPFirmwareModeData is the ExtraInfo key holding the JSON-encoded Output.
StateKeyGSPFirmwareModeData = "data"
// StateKeyGSPFirmwareModeEncoding is the ExtraInfo key holding the encoding name.
StateKeyGSPFirmwareModeEncoding = "encoding"
// StateValueMemoryUsageEncodingJSON is the encoding value stored under
// StateKeyGSPFirmwareModeEncoding.
// NOTE(review): the "MemoryUsage" part of the name looks carried over from
// another component; renaming would break the exported API.
StateValueMemoryUsageEncodingJSON = "json"
)

// ParseStatePersistenceMode decodes the Output stored as JSON under
// StateKeyGSPFirmwareModeData in m. A missing key is decoded as empty input.
//
// NOTE(review): despite its name, this parses GSP firmware mode state; the
// name appears to be carried over from the persistence mode component and is
// kept for API compatibility.
func ParseStatePersistenceMode(m map[string]string) (*Output, error) {
	return ParseOutputJSON([]byte(m[StateKeyGSPFirmwareModeData]))
}

// ParseStatesToOutput rebuilds an Output from component states.
//
// Only the first state is inspected: if it is named StateNameGSPFirmwareMode
// its ExtraInfo is decoded, otherwise an "unknown state name" error is
// returned. An empty input yields a "no state found" error.
func ParseStatesToOutput(states ...components.State) (*Output, error) {
	if len(states) == 0 {
		return nil, errors.New("no state found")
	}

	first := states[0]
	if first.Name != StateNameGSPFirmwareMode {
		return nil, fmt.Errorf("unknown state name: %s", first.Name)
	}

	parsed, err := ParseStatePersistenceMode(first.ExtraInfo)
	if err != nil {
		return nil, err
	}
	return parsed, nil
}

// States converts the output into a single, always-healthy state named
// StateNameGSPFirmwareMode.
//
// Reason lists every device whose GSP firmware mode is not enabled, joined by
// "; ". When no device is reported as disabled, Reason states that the mode
// is enabled for all devices. ExtraInfo carries the JSON-encoded output and
// its encoding name.
//
// It returns an error only if the output cannot be encoded as JSON.
func (o *Output) States() ([]components.State, error) {
	var disabled []string
	for _, mode := range o.GSPFirmwareModesNVML {
		if !mode.Enabled {
			disabled = append(disabled, fmt.Sprintf("device %s does not enable GSP firmware mode (GSP mode supported: %v)", mode.UUID, mode.Supported))
		}
	}

	// No entry in "disabled" means every reported device has the mode enabled.
	reason := "GSP firmware mode is enabled for all devices"
	if len(disabled) > 0 {
		reason = strings.Join(disabled, "; ")
	}

	b, err := o.JSON()
	if err != nil {
		return nil, err
	}

	state := components.State{
		Name:    StateNameGSPFirmwareMode,
		Healthy: true,
		Reason:  reason,
		ExtraInfo: map[string]string{
			StateKeyGSPFirmwareModeData:     string(b),
			StateKeyGSPFirmwareModeEncoding: StateValueMemoryUsageEncodingJSON,
		},
	}
	return []components.State{state}, nil
}
32 changes: 32 additions & 0 deletions components/accelerator/nvidia/gsp-firmware-mode/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package gspfirmwaremode

import (
"database/sql"
"encoding/json"

query_config "github.com/leptonai/gpud/components/query/config"
)

// Config is the configuration of the GSP firmware mode component.
type Config struct {
// Query is the query configuration passed to the poller when the component starts.
Query query_config.Config `json:"query"`
}

// ParseConfig converts an arbitrary value into a Config by round-tripping it
// through JSON: b is marshalled and the result is unmarshalled into Config.
// When the parsed config has a non-nil Query.State, db is attached to it.
func ParseConfig(b any, db *sql.DB) (*Config, error) {
	encoded, err := json.Marshal(b)
	if err != nil {
		return nil, err
	}

	var parsed Config
	if err := json.Unmarshal(encoded, &parsed); err != nil {
		return nil, err
	}

	// The DB handle is only attached when a state section is present.
	if parsed.Query.State != nil {
		parsed.Query.State.DB = db
	}
	return &parsed, nil
}

// Validate performs no checks and always returns nil.
func (cfg Config) Validate() error {
return nil
}
4 changes: 4 additions & 0 deletions components/accelerator/nvidia/gsp-firmware-mode/id/id.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Package id defines the GSP firmware component ID.
package id

// Name is the ID of the GSP firmware mode component.
const Name = "accelerator-nvidia-gsp-firmware"
2 changes: 1 addition & 1 deletion components/accelerator/nvidia/peermem/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Name: nvidia_peermem_id.Name,
Healthy: false,
Healthy: true,
Error: query.ErrNoData.Error(),
Reason: query.ErrNoData.Error(),
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Name: nvidia_persistence_mode_id.Name,
Healthy: false,
Healthy: true,
Error: query.ErrNoData.Error(),
Reason: query.ErrNoData.Error(),
},
Expand Down
32 changes: 32 additions & 0 deletions components/accelerator/nvidia/query/nvml/gsp_firmware.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package nvml

import (
"fmt"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// GSPFirmwareMode is the GSP firmware mode of the device.
// ref. https://www.nvidia.com.tw/Download/driverResults.aspx/224886/tw
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g37f644e70bd4853a78ca2bbf70861f67
type GSPFirmwareMode struct {
// UUID is the device UUID passed to GetGSPFirmwareMode.
UUID string `json:"uuid"`
// Enabled is the first value returned by the device's GetGspFirmwareMode call.
Enabled bool `json:"enabled"`
// Supported is the second value returned by the device's GetGspFirmwareMode call.
// NOTE(review): confirm against the NVML reference above what this second
// value means exactly (supported vs. default mode).
Supported bool `json:"supported"`
}

// GetGSPFirmwareMode queries the GSP firmware mode of dev and labels the
// result with uuid.
//
// A device that reports nvml.ERROR_NOT_SUPPORTED is not treated as a failure:
// it yields a mode with Enabled and Supported both false, so that a GPU
// without GSP firmware support does not fail the caller's whole device query.
// Any other non-success return code is returned as an error with a zero-value
// mode.
func GetGSPFirmwareMode(uuid string, dev device.Device) (GSPFirmwareMode, error) {
	mode := GSPFirmwareMode{
		UUID: uuid,
	}

	gspEnabled, supported, ret := dev.GetGspFirmwareMode()
	switch ret {
	case nvml.SUCCESS:
		mode.Enabled = gspEnabled
		mode.Supported = supported
		return mode, nil

	case nvml.ERROR_NOT_SUPPORTED:
		// Not an error condition: report the mode as neither enabled nor supported.
		return mode, nil

	default:
		return GSPFirmwareMode{}, fmt.Errorf("failed to get gsp firmware mode: %v", nvml.ErrorString(ret))
	}
}
7 changes: 7 additions & 0 deletions components/accelerator/nvidia/query/nvml/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ type DeviceInfo struct {
// Set true if the device supports GPM metrics.
GPMMetricsSupported bool `json:"gpm_metrics_supported"`

GSPFirmwareMode GSPFirmwareMode `json:"gsp_firmware_mode"`
PersistenceMode PersistenceMode `json:"persistence_mode"`
ClockEvents *ClockEvents `json:"clock_events,omitempty"`
ClockSpeed ClockSpeed `json:"clock_speed"`
Expand Down Expand Up @@ -415,6 +416,12 @@ func (inst *instance) Get() (*Output, error) {
st.DeviceInfos = append(st.DeviceInfos, latestInfo)

var err error

latestInfo.GSPFirmwareMode, err = GetGSPFirmwareMode(devInfo.UUID, devInfo.device)
if err != nil {
return st, err
}

latestInfo.PersistenceMode, err = GetPersistenceMode(devInfo.UUID, devInfo.device)
if err != nil {
return st, err
Expand Down
6 changes: 6 additions & 0 deletions components/accelerator/nvidia/query/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,12 @@ func (o *Output) PrintInfo(debug bool) {
for _, dev := range o.NVML.DeviceInfos {
fmt.Printf("\n\n##################\nNVML %s\n\n", dev.UUID)

if dev.GSPFirmwareMode.Enabled {
fmt.Printf("%s NVML GSP firmware mode is enabled (supported: %v)\n", checkMark, dev.GSPFirmwareMode.Supported)
} else {
fmt.Printf("%s NVML GSP firmware mode is disabled (supported: %v)\n", warningSign, dev.GSPFirmwareMode.Supported)
}

// ref. https://docs.nvidia.com/deploy/driver-persistence/index.html
if dev.PersistenceMode.Enabled {
fmt.Printf("%s NVML persistence mode is enabled (nvidia-persistenced running %v)\n", checkMark, o.PersistencedRunning)
Expand Down
2 changes: 2 additions & 0 deletions config/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
nvidia_error_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid"
nvidia_fabric_manager "github.com/leptonai/gpud/components/accelerator/nvidia/fabric-manager"
nvidia_gpm "github.com/leptonai/gpud/components/accelerator/nvidia/gpm"
nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id"
nvidia_infiniband "github.com/leptonai/gpud/components/accelerator/nvidia/infiniband"
nvidia_info "github.com/leptonai/gpud/components/accelerator/nvidia/info"
nvidia_memory "github.com/leptonai/gpud/components/accelerator/nvidia/memory"
Expand Down Expand Up @@ -272,6 +273,7 @@ func DefaultConfig(ctx context.Context, opts ...OpOption) (*Config, error) {
cfg.Components[nvidia_nccl_id.Name] = nil
cfg.Components[nvidia_peermem_id.Name] = nil
cfg.Components[nvidia_persistence_mode_id.Name] = nil
cfg.Components[nvidia_gsp_firmware_mode_id.Name] = nil
} else {
log.Logger.Debugw("auto-detect nvidia not supported -- skipping", "os", runtime.GOOS)
}
Expand Down
1 change: 1 addition & 0 deletions docs/COMPONENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- [**`accelerator-nvidia-error-sxid`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid): Tracks the NVIDIA GPU SXid errors scanning the dmesg -- see [fabric manager documentation](https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf).
- [**`accelerator-nvidia-error-xid`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/error/xid): Tracks the NVIDIA GPU Xid errors scanning the dmesg and using the NVIDIA Management Library (NVML) -- see [Xid messages](https://docs.nvidia.com/deploy/gpu-debug-guidelines/index.html#xid-messages).
- [**`accelerator-nvidia-fabric-manager`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/fabric-manager): Tracks the fabric manager version and its activeness.
- [**`accelerator-nvidia-gsp-firmware`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode): Tracks the GSP firmware mode.
- [**`accelerator-nvidia-infiniband`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/infiniband): Monitors the infiniband status of the system. Optional, enabled if the host has NVIDIA GPUs.
- [**`accelerator-nvidia-info`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/info): Serves relatively static information about the NVIDIA accelerators (e.g., GPU product names).
- [**`accelerator-nvidia-memory`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/memory): Monitors the NVIDIA per-GPU memory usage.
Expand Down
16 changes: 16 additions & 0 deletions internal/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ import (
nvidia_error_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid"
nvidia_fabric_manager "github.com/leptonai/gpud/components/accelerator/nvidia/fabric-manager"
nvidia_gpm "github.com/leptonai/gpud/components/accelerator/nvidia/gpm"
nvidia_gsp_firmware_mode "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode"
nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id"
nvidia_infiniband "github.com/leptonai/gpud/components/accelerator/nvidia/infiniband"
nvidia_info "github.com/leptonai/gpud/components/accelerator/nvidia/info"
nvidia_memory "github.com/leptonai/gpud/components/accelerator/nvidia/memory"
Expand Down Expand Up @@ -611,6 +613,20 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID
}
allComponents = append(allComponents, fabricManagerLogComponent)

case nvidia_gsp_firmware_mode_id.Name:
cfg := nvidia_gsp_firmware_mode.Config{Query: defaultQueryCfg}
if configValue != nil {
parsed, err := nvidia_gsp_firmware_mode.ParseConfig(configValue, db)
if err != nil {
return nil, fmt.Errorf("failed to parse component %s config: %w", k, err)
}
cfg = *parsed
}
if err := cfg.Validate(); err != nil {
return nil, fmt.Errorf("failed to validate component %s config: %w", k, err)
}
allComponents = append(allComponents, nvidia_gsp_firmware_mode.New(ctx, cfg))

case nvidia_infiniband.Name:
cfg := nvidia_infiniband.Config{Query: defaultQueryCfg}
if configValue != nil {
Expand Down

0 comments on commit c9fba83

Please sign in to comment.