Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(client): adding get states decode call, status command to check local gpud "/states", add sub-command aliases #211

Merged
merged 9 commits into from
Nov 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions client/v1/examples/get-states/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package main

import (
"context"
"errors"
"time"

client_v1 "github.com/leptonai/gpud/client/v1"
"github.com/leptonai/gpud/errdefs"
"github.com/leptonai/gpud/log"
)

// main queries a gpud server at a fixed local address for the states of a
// fixed list of components and logs every state that comes back.
//
// A component the server reports as not found is logged as a warning and
// skipped so the remaining components are still queried. Any other failure is
// logged as an error and ends the program.
func main() {
	baseURL := "https://localhost:15132"
	for _, componentName := range []string{"disk", "accelerator-nvidia-info"} {
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		states, err := client_v1.GetStates(ctx, baseURL, client_v1.WithComponent(componentName))
		// Release the per-request timeout right away instead of deferring it:
		// a defer inside the loop would only run when main returns.
		cancel()
		if err != nil {
			if errors.Is(err, errdefs.ErrNotFound) {
				log.Logger.Warnw("component not found", "component", componentName)
				continue
			}

			// Errorw (not Error) so the key/value pairs are emitted as
			// structured fields rather than concatenated into the message.
			log.Logger.Errorw("error fetching component info", "component", componentName, "error", err)
			return
		}

		for _, ss := range states {
			for _, s := range ss.States {
				log.Logger.Infof("state: %q, healthy: %v, extra info: %q", s.Name, s.Healthy, s.ExtraInfo)
			}
		}
	}
}
2 changes: 1 addition & 1 deletion client/v1/v1.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ func GetStates(ctx context.Context, addr string, opts ...OpOption) (v1.LeptonSta
return nil, errors.New("server not ready, response not 200")
}

return nil, nil
return ReadStates(resp.Body, opts...)
}

func ReadStates(rd io.Reader, opts ...OpOption) (v1.LeptonStates, error) {
Expand Down
4 changes: 2 additions & 2 deletions cmd/gpud/command/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ sudo rm /etc/systemd/system/gpud.service
},
{
Name: "logs",
Aliases: []string{"l"},
Aliases: []string{"log", "l"},

Usage: "checks the gpud logs",
Action: cmdLogs,
Expand Down Expand Up @@ -499,7 +499,7 @@ cat summary.txt
},
{
Name: "scan",
Aliases: []string{"s"},
Aliases: []string{"check", "s"},

Usage: "quick scans the host for any major issues",
Action: cmdScan,
Expand Down
2 changes: 1 addition & 1 deletion cmd/gpud/command/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ func cmdRun(cliContext *cli.Context) error {

if pkd_systemd.SystemctlExists() {
if err := notifyReady(rootCtx); err != nil {
log.Logger.Warn("notify ready failed")
log.Logger.Warnw("notify ready failed")
}
} else {
log.Logger.Debugw("skipped sd notify as systemd is not available")
Expand Down
66 changes: 66 additions & 0 deletions cmd/gpud/command/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@ import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"time"

client "github.com/leptonai/gpud/client/v1"
"github.com/leptonai/gpud/config"
"github.com/leptonai/gpud/errdefs"
"github.com/leptonai/gpud/log"
"github.com/leptonai/gpud/manager/packages"
"github.com/leptonai/gpud/pkg/systemd"

Expand All @@ -34,6 +37,16 @@ func cmdStatus(cliContext *cli.Context) error {
}
fmt.Printf("%s successfully checked gpud status\n", checkMark)

if err := checkDiskComponent(); err != nil {
return err
}
fmt.Printf("%s successfully checked whether disk component is running\n", checkMark)

if err := checkNvidiaInfoComponent(); err != nil {
return err
}
fmt.Printf("%s successfully checked whether accelerator-nvidia-info component is running\n", checkMark)

if err := client.BlockUntilServerReady(
rootCtx,
fmt.Sprintf("https://localhost:%d", config.DefaultGPUdPort),
Expand Down Expand Up @@ -76,6 +89,59 @@ func cmdStatus(cliContext *cli.Context) error {
return nil
}

// checkDiskComponent asks the local gpud server (on config.DefaultGPUdPort)
// for the states of the "disk" component and logs each returned state.
//
// It returns an error when the request fails, including when the component is
// not found, or when the server returns no states. Every returned error names
// the component, and request errors are wrapped with %w so callers can still
// match them with errors.Is.
func checkDiskComponent() error {
	baseURL := fmt.Sprintf("https://localhost:%d", config.DefaultGPUdPort)
	componentName := "disk"

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	states, err := client.GetStates(ctx, baseURL, client.WithComponent(componentName))
	if err != nil {
		// assume disk component is enabled for all platforms, so a missing
		// component is treated as a failure here
		return fmt.Errorf("getting states of component %q: %w", componentName, err)
	}
	if len(states) == 0 {
		return fmt.Errorf("empty state returned for component %q", componentName)
	}

	for _, ss := range states {
		for _, s := range ss.States {
			log.Logger.Infof("state: %q, healthy: %v, extra info: %q", s.Name, s.Healthy, s.ExtraInfo)
		}
	}

	return nil
}

// checkNvidiaInfoComponent asks the local gpud server (on
// config.DefaultGPUdPort) for the states of the "accelerator-nvidia-info"
// component and logs each returned state.
//
// A not-found response is not an error: it is logged as a warning and nil is
// returned. Any other request failure, or an empty set of states, is returned
// as an error that names the component. Request errors are wrapped with %w so
// callers can still match them with errors.Is.
func checkNvidiaInfoComponent() error {
	baseURL := fmt.Sprintf("https://localhost:%d", config.DefaultGPUdPort)
	componentName := "accelerator-nvidia-info"

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	states, err := client.GetStates(ctx, baseURL, client.WithComponent(componentName))
	if err != nil {
		if errors.Is(err, errdefs.ErrNotFound) {
			// Unlike the disk check, a missing component is tolerated here.
			log.Logger.Warnw("component not found", "component", componentName)
			return nil
		}
		return fmt.Errorf("getting states of component %q: %w", componentName, err)
	}
	if len(states) == 0 {
		return fmt.Errorf("empty state returned for component %q", componentName)
	}

	for _, ss := range states {
		for _, s := range ss.States {
			log.Logger.Infof("state: %q, healthy: %v, extra info: %q", s.Name, s.Healthy, s.ExtraInfo)
		}
	}

	return nil
}

func getStatus() ([]packages.PackageStatus, error) {
httpClient := &http.Client{
Transport: &http.Transport{
Expand Down
11 changes: 11 additions & 0 deletions e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,17 @@ func TestGpudHealthzInfo(t *testing.T) {
}
t.Logf("respMetrics size:\n%s", string(metricsBytes))

t.Log("now testing with client/v1 basic disk get states")
states, err := client_v1.GetStates(ctx, "https://"+ep, client_v1.WithComponent("disk"))
if err != nil {
t.Errorf("failed to get states: %v", err)
}
for _, ss := range states {
for _, s := range ss.States {
t.Logf("state: %q, healthy: %v, extra info: %q\n", s.Name, s.Healthy, s.ExtraInfo)
}
}

t.Log("now testing with client/v1")
for _, opts := range [][]client_v1.OpOption{
{client_v1.WithRequestContentTypeJSON()},
Expand Down