feat(nvidia/gsp-firmware-mode): initial commit to track GSP modes, fi…

…x no data handling (#162) * feat(nvidia/gsp-firmware-mode): initial commit to track GSP modes Signed-off-by: Gyuho Lee <[email protected]> * track supported field Signed-off-by: Gyuho Lee <[email protected]> * track supported in reason Signed-off-by: Gyuho Lee <[email protected]> * fix Healthy false for no data Signed-off-by: Gyuho Lee <[email protected]> --------- Signed-off-by: Gyuho Lee <[email protected]>
leptonai · Nov 7, 2024 · c9fba83 · c9fba83
1 parent e7b13b3
commit c9fba83
Show file tree

Hide file tree

Showing 13 changed files with 302 additions and 3 deletions.
diff --git a/components/accelerator/nvidia/bad-envs/component.go b/components/accelerator/nvidia/bad-envs/component.go
@@ -43,7 +43,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
 		return []components.State{
 			{
 				Name:    bad_envs_id.Name,
-				Healthy: false,
+				Healthy: true,
 				Error:   query.ErrNoData.Error(),
 				Reason:  query.ErrNoData.Error(),
 			},

diff --git a/components/accelerator/nvidia/gsp-firmware-mode/component.go b/components/accelerator/nvidia/gsp-firmware-mode/component.go
@@ -0,0 +1,99 @@
+// Package gspfirmwaremode tracks the NVIDIA GSP firmware mode.
+package gspfirmwaremode
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/leptonai/gpud/components"
+	nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id"
+	nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
+	"github.com/leptonai/gpud/components/query"
+	"github.com/leptonai/gpud/log"
+)
+
+// New builds the GSP firmware mode component.
+//
+// It fills in unset query defaults on cfg, starts the package-level NVIDIA
+// poller under this component's ID using a cancelable child of ctx, and
+// returns a component that reads from that poller.
+func New(ctx context.Context, cfg Config) components.Component {
+	cfg.Query.SetDefaultsIfNotSet()
+
+	pollCtx, pollCancel := context.WithCancel(ctx)
+	nvidia_query.DefaultPoller.Start(pollCtx, cfg.Query, nvidia_gsp_firmware_mode_id.Name)
+
+	comp := &component{
+		rootCtx: ctx,
+		cancel:  pollCancel,
+		poller:  nvidia_query.DefaultPoller,
+	}
+	return comp
+}
+
+var _ components.Component = (*component)(nil)
+
+// component is the components.Component implementation for the GSP firmware mode.
+// rootCtx is the context handed to New, cancel cancels the child context the
+// poller was started with, and poller supplies the last query result used by States.
+type component struct {
+	rootCtx context.Context
+	cancel  context.CancelFunc
+	poller  query.Poller
+}
+
+// Name returns the ID under which this component is registered.
+func (c *component) Name() string {
+	return nvidia_gsp_firmware_mode_id.Name
+}
+
+// States reports the component state derived from the poller's last result.
+//
+// When the poller has no data yet, a single healthy state carrying
+// query.ErrNoData as error and reason is returned. When the last query failed
+// or produced no output, a single unhealthy state is returned. Otherwise the
+// NVIDIA query output is converted with ToOutput and its States are returned.
+// An error is returned if the poller fails for any other reason or if the
+// output has an unexpected type.
+func (c *component) States(ctx context.Context) ([]components.State, error) {
+	last, err := c.poller.Last()
+	if err == query.ErrNoData { // no data
+		log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", nvidia_gsp_firmware_mode_id.Name)
+		return []components.State{
+			{
+				Name:    nvidia_gsp_firmware_mode_id.Name,
+				Healthy: true,
+				Error:   query.ErrNoData.Error(),
+				Reason:  query.ErrNoData.Error(),
+			},
+		}, nil
+	}
+	if err != nil {
+		return nil, err
+	}
+	if last.Error != nil {
+		return []components.State{
+			{
+				// name the state like the no-data branch so it can be attributed
+				Name:    nvidia_gsp_firmware_mode_id.Name,
+				Healthy: false,
+				Error:   last.Error.Error(),
+				Reason:  "last query failed",
+			},
+		}, nil
+	}
+	if last.Output == nil {
+		return []components.State{
+			{
+				// name the state like the no-data branch so it can be attributed
+				Name:    nvidia_gsp_firmware_mode_id.Name,
+				Healthy: false,
+				Reason:  "no output",
+			},
+		}, nil
+	}
+
+	allOutput, ok := last.Output.(*nvidia_query.Output)
+	if !ok {
+		return nil, fmt.Errorf("invalid output type: %T", last.Output)
+	}
+
+	output := ToOutput(allOutput)
+	return output.States()
+}
+
+// Events always returns nil events and a nil error; ctx and since are unused.
+func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
+	return nil, nil
+}
+
+// Metrics logs the request at debug level and always returns nil metrics
+// and a nil error.
+func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) {
+	log.Logger.Debugw("querying metrics", "since", since)
+
+	return nil, nil
+}
+
+// Close stops the poller for this component's ID and always returns nil;
+// the error returned by Stop is deliberately ignored.
+//
+// NOTE(review): c.cancel is not called here, so the child context created in
+// New is only released when the parent context ends -- confirm this is intended.
+func (c *component) Close() error {
+	log.Logger.Debugw("closing component")
+
+	// safe to call stop multiple times
+	_ = c.poller.Stop(nvidia_gsp_firmware_mode_id.Name)
+
+	return nil
+}
diff --git a/components/accelerator/nvidia/gsp-firmware-mode/component_output.go b/components/accelerator/nvidia/gsp-firmware-mode/component_output.go
@@ -0,0 +1,100 @@
+package gspfirmwaremode
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strings"
+
+	"github.com/leptonai/gpud/components"
+	nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
+	nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml"
+)
+
+// ToOutput converts nvidia_query.Output to Output by collecting the GSP
+// firmware mode of every NVML device.
+// It always returns a non-nil object, which is empty if the input or its
+// NVML field is nil.
+func ToOutput(i *nvidia_query.Output) *Output {
+	out := &Output{}
+	if i == nil || i.NVML == nil {
+		return out
+	}
+
+	for _, dev := range i.NVML.DeviceInfos {
+		out.GSPFirmwareModesNVML = append(out.GSPFirmwareModesNVML, dev.GSPFirmwareMode)
+	}
+	return out
+}
+
+// Output holds the GSP firmware modes collected from NVML, one entry per device.
+type Output struct {
+	GSPFirmwareModesNVML []nvidia_query_nvml.GSPFirmwareMode `json:"gsp_firmware_modes_nvml"`
+}
+
+// JSON returns the JSON encoding of the output.
+func (o *Output) JSON() ([]byte, error) {
+	return json.Marshal(o)
+}
+
+// ParseOutputJSON decodes JSON data into a new Output.
+// It returns a nil Output together with the decoding error if data is not valid.
+func ParseOutputJSON(data []byte) (*Output, error) {
+	var parsed Output
+	if err := json.Unmarshal(data, &parsed); err != nil {
+		return nil, err
+	}
+	return &parsed, nil
+}
+
+// State name, extra-info keys, and encoding value used by (*Output).States
+// and read back by ParseStatesToOutput.
+const (
+	// StateNameGSPFirmwareMode is the name of the state emitted by (*Output).States.
+	StateNameGSPFirmwareMode = "gsp_firmware_mode"
+
+	// Extra-info keys: "data" holds the JSON-encoded Output, "encoding" names its format.
+	StateKeyGSPFirmwareModeData       = "data"
+	StateKeyGSPFirmwareModeEncoding   = "encoding"
+	// NOTE(review): the name mentions "MemoryUsage" although it is used for the
+	// GSP firmware mode encoding -- presumably carried over from another component.
+	StateValueMemoryUsageEncodingJSON = "json"
+)
+
+// ParseStatePersistenceMode decodes the Output stored as JSON under the
+// StateKeyGSPFirmwareModeData key of m. Despite its name, it parses the GSP
+// firmware mode state. A missing key is passed on as empty input.
+func ParseStatePersistenceMode(m map[string]string) (*Output, error) {
+	return ParseOutputJSON([]byte(m[StateKeyGSPFirmwareModeData]))
+}
+
+// ParseStatesToOutput rebuilds an Output from component states.
+//
+// Only the first state is examined: if it is named StateNameGSPFirmwareMode
+// its extra info is decoded, otherwise an "unknown state name" error is
+// returned. An error is also returned when no state is given.
+func ParseStatesToOutput(states ...components.State) (*Output, error) {
+	if len(states) == 0 {
+		return nil, errors.New("no state found")
+	}
+
+	first := states[0]
+	if first.Name != StateNameGSPFirmwareMode {
+		return nil, fmt.Errorf("unknown state name: %s", first.Name)
+	}
+	return ParseStatePersistenceMode(first.ExtraInfo)
+}
+
+// States converts the output into a single, always healthy component state.
+//
+// The reason lists every device that does not have GSP firmware mode enabled
+// (together with whether the device supports it). When no device is listed,
+// the reason states that the mode is enabled for all devices. The
+// JSON-encoded output is attached as extra info; an error is returned only
+// if that encoding fails.
+func (o *Output) States() ([]components.State, error) {
+	reasons := make([]string, 0, len(o.GSPFirmwareModesNVML))
+	for _, mode := range o.GSPFirmwareModesNVML {
+		if !mode.Enabled {
+			reasons = append(reasons, fmt.Sprintf("device %s does not enable GSP firmware mode (GSP mode supported: %v)", mode.UUID, mode.Supported))
+		}
+	}
+
+	// reasons only collects devices with the mode disabled, so an empty list
+	// means that no device reported the mode as disabled
+	reason := "GSP firmware mode is enabled for all devices"
+	if len(reasons) > 0 {
+		reason = strings.Join(reasons, "; ")
+	}
+
+	b, err := o.JSON()
+	if err != nil {
+		return nil, err
+	}
+	state := components.State{
+		Name:    StateNameGSPFirmwareMode,
+		Healthy: true,
+		Reason:  reason,
+		ExtraInfo: map[string]string{
+			StateKeyGSPFirmwareModeData:     string(b),
+			StateKeyGSPFirmwareModeEncoding: StateValueMemoryUsageEncodingJSON,
+		},
+	}
+	return []components.State{state}, nil
+}
diff --git a/components/accelerator/nvidia/gsp-firmware-mode/config.go b/components/accelerator/nvidia/gsp-firmware-mode/config.go
@@ -0,0 +1,32 @@
+package gspfirmwaremode
+
+import (
+	"database/sql"
+	"encoding/json"
+
+	query_config "github.com/leptonai/gpud/components/query/config"
+)
+
+// Config is the GSP firmware mode component configuration; it only carries
+// the query settings passed to the poller.
+type Config struct {
+	Query query_config.Config `json:"query"`
+}
+
+// ParseConfig converts an arbitrary value into a Config by round-tripping it
+// through JSON. If the parsed config has a query state section, db is
+// assigned as its database handle. Encoding and decoding errors are returned as is.
+func ParseConfig(b any, db *sql.DB) (*Config, error) {
+	encoded, err := json.Marshal(b)
+	if err != nil {
+		return nil, err
+	}
+
+	var cfg Config
+	if err := json.Unmarshal(encoded, &cfg); err != nil {
+		return nil, err
+	}
+
+	if state := cfg.Query.State; state != nil {
+		state.DB = db
+	}
+	return &cfg, nil
+}
+
+// Validate always returns nil; no field of the config is checked.
+func (cfg Config) Validate() error {
+	return nil
+}
diff --git a/components/accelerator/nvidia/gsp-firmware-mode/id/id.go b/components/accelerator/nvidia/gsp-firmware-mode/id/id.go
@@ -0,0 +1,4 @@
+// Package id defines the GSP firmware component ID.
+package id
+
+// Name is the ID of the GSP firmware mode component.
+const Name = "accelerator-nvidia-gsp-firmware"
diff --git a/components/accelerator/nvidia/peermem/component.go b/components/accelerator/nvidia/peermem/component.go
@@ -46,7 +46,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
 		return []components.State{
 			{
 				Name:    nvidia_peermem_id.Name,
-				Healthy: false,
+				Healthy: true,
 				Error:   query.ErrNoData.Error(),
 				Reason:  query.ErrNoData.Error(),
 			},

diff --git a/components/accelerator/nvidia/persistence-mode/component.go b/components/accelerator/nvidia/persistence-mode/component.go
@@ -43,7 +43,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
 		return []components.State{
 			{
 				Name:    nvidia_persistence_mode_id.Name,
-				Healthy: false,
+				Healthy: true,
 				Error:   query.ErrNoData.Error(),
 				Reason:  query.ErrNoData.Error(),
 			},

diff --git a/components/accelerator/nvidia/query/nvml/gsp_firmware.go b/components/accelerator/nvidia/query/nvml/gsp_firmware.go
@@ -0,0 +1,32 @@
+package nvml
+
+import (
+	"fmt"
+
+	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// GSPFirmwareMode is the GSP firmware mode of the device.
+// ref. https://www.nvidia.com.tw/Download/driverResults.aspx/224886/tw
+// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g37f644e70bd4853a78ca2bbf70861f67
+// UUID is the device UUID handed to GetGSPFirmwareMode; Enabled and Supported
+// are the two values reported by the device's GetGspFirmwareMode call.
+type GSPFirmwareMode struct {
+	UUID      string `json:"uuid"`
+	Enabled   bool   `json:"enabled"`
+	Supported bool   `json:"supported"`
+}
+
+// GetGSPFirmwareMode queries the GSP firmware mode of dev and labels the
+// result with uuid.
+//
+// A device that answers with nvml.ERROR_NOT_SUPPORTED is reported as neither
+// enabled nor supported instead of failing, so that one such device does not
+// turn the whole query into an error. Any other non-success return value is
+// returned as an error together with a zero GSPFirmwareMode.
+func GetGSPFirmwareMode(uuid string, dev device.Device) (GSPFirmwareMode, error) {
+	mode := GSPFirmwareMode{
+		UUID: uuid,
+	}
+
+	gspEnabled, supported, ret := dev.GetGspFirmwareMode()
+	if ret == nvml.ERROR_NOT_SUPPORTED {
+		// the query itself is unavailable for this device: not an error,
+		// just record the mode as disabled and unsupported
+		mode.Enabled = false
+		mode.Supported = false
+		return mode, nil
+	}
+	if ret != nvml.SUCCESS {
+		return GSPFirmwareMode{}, fmt.Errorf("failed to get gsp firmware mode: %v", nvml.ErrorString(ret))
+	}
+	mode.Enabled = gspEnabled
+	mode.Supported = supported
+
+	return mode, nil
+}
diff --git a/components/accelerator/nvidia/query/nvml/nvml.go b/components/accelerator/nvidia/query/nvml/nvml.go
@@ -103,6 +103,7 @@ type DeviceInfo struct {
 	// Set true if the device supports GPM metrics.
 	GPMMetricsSupported bool `json:"gpm_metrics_supported"`
 
+	GSPFirmwareMode GSPFirmwareMode `json:"gsp_firmware_mode"`
 	PersistenceMode PersistenceMode `json:"persistence_mode"`
 	ClockEvents     *ClockEvents    `json:"clock_events,omitempty"`
 	ClockSpeed      ClockSpeed      `json:"clock_speed"`
@@ -415,6 +416,12 @@ func (inst *instance) Get() (*Output, error) {
 		st.DeviceInfos = append(st.DeviceInfos, latestInfo)
 
 		var err error
+
+		latestInfo.GSPFirmwareMode, err = GetGSPFirmwareMode(devInfo.UUID, devInfo.device)
+		if err != nil {
+			return st, err
+		}
+
 		latestInfo.PersistenceMode, err = GetPersistenceMode(devInfo.UUID, devInfo.device)
 		if err != nil {
 			return st, err

diff --git a/components/accelerator/nvidia/query/query.go b/components/accelerator/nvidia/query/query.go
@@ -512,6 +512,12 @@ func (o *Output) PrintInfo(debug bool) {
 		for _, dev := range o.NVML.DeviceInfos {
 			fmt.Printf("\n\n##################\nNVML %s\n\n", dev.UUID)
 
+			if dev.GSPFirmwareMode.Enabled {
+				fmt.Printf("%s NVML GSP firmware mode is enabled (supported: %v)\n", checkMark, dev.GSPFirmwareMode.Supported)
+			} else {
+				fmt.Printf("%s NVML GSP firmware mode is disabled (supported: %v)\n", warningSign, dev.GSPFirmwareMode.Supported)
+			}
+
 			// ref. https://docs.nvidia.com/deploy/driver-persistence/index.html
 			if dev.PersistenceMode.Enabled {
 				fmt.Printf("%s NVML persistence mode is enabled (nvidia-persistenced running %v)\n", checkMark, o.PersistencedRunning)

diff --git a/config/default.go b/config/default.go
@@ -17,6 +17,7 @@ import (
 	nvidia_error_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid"
 	nvidia_fabric_manager "github.com/leptonai/gpud/components/accelerator/nvidia/fabric-manager"
 	nvidia_gpm "github.com/leptonai/gpud/components/accelerator/nvidia/gpm"
+	nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id"
 	nvidia_infiniband "github.com/leptonai/gpud/components/accelerator/nvidia/infiniband"
 	nvidia_info "github.com/leptonai/gpud/components/accelerator/nvidia/info"
 	nvidia_memory "github.com/leptonai/gpud/components/accelerator/nvidia/memory"
@@ -272,6 +273,7 @@ func DefaultConfig(ctx context.Context, opts ...OpOption) (*Config, error) {
 		cfg.Components[nvidia_nccl_id.Name] = nil
 		cfg.Components[nvidia_peermem_id.Name] = nil
 		cfg.Components[nvidia_persistence_mode_id.Name] = nil
+		cfg.Components[nvidia_gsp_firmware_mode_id.Name] = nil
 	} else {
 		log.Logger.Debugw("auto-detect nvidia not supported -- skipping", "os", runtime.GOOS)
 	}

diff --git a/docs/COMPONENTS.md b/docs/COMPONENTS.md
@@ -10,6 +10,7 @@
 - [**`accelerator-nvidia-error-sxid`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid): Tracks the NVIDIA GPU SXid errors scanning the dmesg -- see [fabric manager documentation](https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf).
 - [**`accelerator-nvidia-error-xid`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/error/xid): Tracks the NVIDIA GPU Xid errors scanning the dmesg and using the NVIDIA Management Library (NVML) -- see [Xid messages](https://docs.nvidia.com/deploy/gpu-debug-guidelines/index.html#xid-messages).
 - [**`accelerator-nvidia-fabric-manager`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/fabric-manager): Tracks the fabric manager version and its activeness.
+- [**`accelerator-nvidia-gsp-firmware`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode): Tracks the GSP firmware mode.
 - [**`accelerator-nvidia-infiniband`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/infiniband): Monitors the infiniband status of the system. Optional, enabled if the host has NVIDIA GPUs.
 - [**`accelerator-nvidia-info`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/info): Serves relatively static information about the NVIDIA accelerators (e.g., GPU product names).
 - [**`accelerator-nvidia-memory`**](https://pkg.go.dev/github.com/leptonai/gpud/components/accelerator/nvidia/memory): Monitors the NVIDIA per-GPU memory usage.

diff --git a/internal/server/server.go b/internal/server/server.go
@@ -41,6 +41,8 @@ import (
 	nvidia_error_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid"
 	nvidia_fabric_manager "github.com/leptonai/gpud/components/accelerator/nvidia/fabric-manager"
 	nvidia_gpm "github.com/leptonai/gpud/components/accelerator/nvidia/gpm"
+	nvidia_gsp_firmware_mode "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode"
+	nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id"
 	nvidia_infiniband "github.com/leptonai/gpud/components/accelerator/nvidia/infiniband"
 	nvidia_info "github.com/leptonai/gpud/components/accelerator/nvidia/info"
 	nvidia_memory "github.com/leptonai/gpud/components/accelerator/nvidia/memory"
@@ -611,6 +613,20 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID
 			}
 			allComponents = append(allComponents, fabricManagerLogComponent)
 
+		case nvidia_gsp_firmware_mode_id.Name:
+			cfg := nvidia_gsp_firmware_mode.Config{Query: defaultQueryCfg}
+			if configValue != nil {
+				parsed, err := nvidia_gsp_firmware_mode.ParseConfig(configValue, db)
+				if err != nil {
+					return nil, fmt.Errorf("failed to parse component %s config: %w", k, err)
+				}
+				cfg = *parsed
+			}
+			if err := cfg.Validate(); err != nil {
+				return nil, fmt.Errorf("failed to validate component %s config: %w", k, err)
+			}
+			allComponents = append(allComponents, nvidia_gsp_firmware_mode.New(ctx, cfg))
+
 		case nvidia_infiniband.Name:
 			cfg := nvidia_infiniband.Config{Query: defaultQueryCfg}
 			if configValue != nil {