-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(nvidia/gsp-firmware-mode): initial commit to track GSP modes, fix no data handling (#162)
* feat(nvidia/gsp-firmware-mode): initial commit to track GSP modes Signed-off-by: Gyuho Lee <[email protected]>
* track supported field Signed-off-by: Gyuho Lee <[email protected]>
* track supported in reason Signed-off-by: Gyuho Lee <[email protected]>
* fix Healthy false for no data Signed-off-by: Gyuho Lee <[email protected]>
--------- Signed-off-by: Gyuho Lee <[email protected]>
- Loading branch information
Showing
13 changed files
with
302 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
99 changes: 99 additions & 0 deletions
99
components/accelerator/nvidia/gsp-firmware-mode/component.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
// Package gspfirmwaremode tracks the NVIDIA GSP firmware mode. | ||
package gspfirmwaremode | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/leptonai/gpud/components" | ||
nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id" | ||
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" | ||
"github.com/leptonai/gpud/components/query" | ||
"github.com/leptonai/gpud/log" | ||
) | ||
|
||
func New(ctx context.Context, cfg Config) components.Component { | ||
cfg.Query.SetDefaultsIfNotSet() | ||
|
||
cctx, ccancel := context.WithCancel(ctx) | ||
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, nvidia_gsp_firmware_mode_id.Name) | ||
|
||
return &component{ | ||
rootCtx: ctx, | ||
cancel: ccancel, | ||
poller: nvidia_query.DefaultPoller, | ||
} | ||
} | ||
|
||
var _ components.Component = (*component)(nil) | ||
|
||
type component struct { | ||
rootCtx context.Context | ||
cancel context.CancelFunc | ||
poller query.Poller | ||
} | ||
|
||
func (c *component) Name() string { return nvidia_gsp_firmware_mode_id.Name } | ||
|
||
func (c *component) States(ctx context.Context) ([]components.State, error) { | ||
last, err := c.poller.Last() | ||
if err == query.ErrNoData { // no data | ||
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", nvidia_gsp_firmware_mode_id.Name) | ||
return []components.State{ | ||
{ | ||
Name: nvidia_gsp_firmware_mode_id.Name, | ||
Healthy: true, | ||
Error: query.ErrNoData.Error(), | ||
Reason: query.ErrNoData.Error(), | ||
}, | ||
}, nil | ||
} | ||
if err != nil { | ||
return nil, err | ||
} | ||
if last.Error != nil { | ||
return []components.State{ | ||
{ | ||
Healthy: false, | ||
Error: last.Error.Error(), | ||
Reason: "last query failed", | ||
}, | ||
}, nil | ||
} | ||
if last.Output == nil { | ||
return []components.State{ | ||
{ | ||
Healthy: false, | ||
Reason: "no output", | ||
}, | ||
}, nil | ||
} | ||
|
||
allOutput, ok := last.Output.(*nvidia_query.Output) | ||
if !ok { | ||
return nil, fmt.Errorf("invalid output type: %T", last.Output) | ||
} | ||
|
||
output := ToOutput(allOutput) | ||
return output.States() | ||
} | ||
|
||
func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) { | ||
return nil, nil | ||
} | ||
|
||
func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) { | ||
log.Logger.Debugw("querying metrics", "since", since) | ||
|
||
return nil, nil | ||
} | ||
|
||
func (c *component) Close() error { | ||
log.Logger.Debugw("closing component") | ||
|
||
// safe to call stop multiple times | ||
_ = c.poller.Stop(nvidia_gsp_firmware_mode_id.Name) | ||
|
||
return nil | ||
} |
100 changes: 100 additions & 0 deletions
100
components/accelerator/nvidia/gsp-firmware-mode/component_output.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
package gspfirmwaremode | ||
|
||
import ( | ||
"encoding/json" | ||
"errors" | ||
"fmt" | ||
"strings" | ||
|
||
"github.com/leptonai/gpud/components" | ||
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" | ||
nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml" | ||
) | ||
|
||
// ToOutput converts nvidia_query.Output to Output. | ||
// It returns an empty non-nil object, if the input or the required field is nil (e.g., i.SMI). | ||
func ToOutput(i *nvidia_query.Output) *Output { | ||
if i == nil { | ||
return &Output{} | ||
} | ||
|
||
o := &Output{} | ||
if i.NVML != nil { | ||
for _, device := range i.NVML.DeviceInfos { | ||
o.GSPFirmwareModesNVML = append(o.GSPFirmwareModesNVML, device.GSPFirmwareMode) | ||
} | ||
} | ||
|
||
return o | ||
} | ||
|
||
type Output struct { | ||
GSPFirmwareModesNVML []nvidia_query_nvml.GSPFirmwareMode `json:"gsp_firmware_modes_nvml"` | ||
} | ||
|
||
func (o *Output) JSON() ([]byte, error) { | ||
return json.Marshal(o) | ||
} | ||
|
||
func ParseOutputJSON(data []byte) (*Output, error) { | ||
o := new(Output) | ||
if err := json.Unmarshal(data, o); err != nil { | ||
return nil, err | ||
} | ||
return o, nil | ||
} | ||
|
||
const (
	// StateNameGSPFirmwareMode is the name of the state produced by
	// (*Output).States and accepted by ParseStatesToOutput.
	StateNameGSPFirmwareMode = "gsp_firmware_mode"

	// StateKeyGSPFirmwareModeData is the ExtraInfo key holding the encoded Output.
	StateKeyGSPFirmwareModeData = "data"
	// StateKeyGSPFirmwareModeEncoding is the ExtraInfo key naming the encoding
	// of the data entry.
	StateKeyGSPFirmwareModeEncoding = "encoding"
	// StateValueMemoryUsageEncodingJSON is the encoding value stored under
	// StateKeyGSPFirmwareModeEncoding.
	// NOTE(review): the "MemoryUsage" in the name looks inherited from another
	// component; it is kept because the identifier is exported.
	StateValueMemoryUsageEncodingJSON = "json"
)
|
||
func ParseStatePersistenceMode(m map[string]string) (*Output, error) { | ||
data := m[StateKeyGSPFirmwareModeData] | ||
return ParseOutputJSON([]byte(data)) | ||
} | ||
|
||
func ParseStatesToOutput(states ...components.State) (*Output, error) { | ||
for _, state := range states { | ||
switch state.Name { | ||
case StateNameGSPFirmwareMode: | ||
o, err := ParseStatePersistenceMode(state.ExtraInfo) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return o, nil | ||
|
||
default: | ||
return nil, fmt.Errorf("unknown state name: %s", state.Name) | ||
} | ||
} | ||
return nil, errors.New("no state found") | ||
} | ||
|
||
func (o *Output) States() ([]components.State, error) { | ||
reasons := []string{} | ||
for _, mode := range o.GSPFirmwareModesNVML { | ||
if !mode.Enabled { | ||
reasons = append(reasons, fmt.Sprintf("device %s does not enable GSP firmware mode (GSP mode supported: %v)", mode.UUID, mode.Supported)) | ||
} | ||
} | ||
reason := "GSP firmware mode is disabled for all devices" | ||
if len(reasons) > 0 { | ||
reason = strings.Join(reasons, "; ") | ||
} | ||
|
||
b, _ := o.JSON() | ||
state := components.State{ | ||
Name: StateNameGSPFirmwareMode, | ||
Healthy: true, | ||
Reason: reason, | ||
ExtraInfo: map[string]string{ | ||
StateKeyGSPFirmwareModeData: string(b), | ||
StateKeyGSPFirmwareModeEncoding: StateValueMemoryUsageEncodingJSON, | ||
}, | ||
} | ||
return []components.State{state}, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package gspfirmwaremode | ||
|
||
import ( | ||
"database/sql" | ||
"encoding/json" | ||
|
||
query_config "github.com/leptonai/gpud/components/query/config" | ||
) | ||
|
||
type Config struct { | ||
Query query_config.Config `json:"query"` | ||
} | ||
|
||
func ParseConfig(b any, db *sql.DB) (*Config, error) { | ||
raw, err := json.Marshal(b) | ||
if err != nil { | ||
return nil, err | ||
} | ||
cfg := new(Config) | ||
err = json.Unmarshal(raw, cfg) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if cfg.Query.State != nil { | ||
cfg.Query.State.DB = db | ||
} | ||
return cfg, nil | ||
} | ||
|
||
func (cfg Config) Validate() error { | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
// Package id defines the GSP firmware component ID. | ||
package id | ||
|
||
// Name is the component ID string for the GSP firmware component.
const Name = "accelerator-nvidia-gsp-firmware"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package nvml | ||
|
||
import ( | ||
"fmt" | ||
|
||
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" | ||
"github.com/NVIDIA/go-nvml/pkg/nvml" | ||
) | ||
|
||
// GSPFirmwareMode describes the GSP firmware mode reported for one device.
// ref. https://www.nvidia.com.tw/Download/driverResults.aspx/224886/tw
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g37f644e70bd4853a78ca2bbf70861f67
type GSPFirmwareMode struct {
	// UUID is the device UUID supplied to GetGSPFirmwareMode.
	UUID string `json:"uuid"`
	// Enabled is the first value reported by the device's GetGspFirmwareMode call.
	Enabled bool `json:"enabled"`
	// Supported is the second value reported by the device's GetGspFirmwareMode call.
	Supported bool `json:"supported"`
}
|
||
func GetGSPFirmwareMode(uuid string, dev device.Device) (GSPFirmwareMode, error) { | ||
mode := GSPFirmwareMode{ | ||
UUID: uuid, | ||
} | ||
|
||
gspEnabled, supported, ret := dev.GetGspFirmwareMode() | ||
if ret != nvml.SUCCESS { | ||
return GSPFirmwareMode{}, fmt.Errorf("failed to get gsp firmware mode: %v", nvml.ErrorString(ret)) | ||
} | ||
mode.Enabled = gspEnabled | ||
mode.Supported = supported | ||
|
||
return mode, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters