Skip to content

Commit

Permalink
feat(nvidia): configurable nvidia-smi binary, ibstat binary, infiniba…
Browse files Browse the repository at this point in the history
…nd class dir paths for mock testing

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Jan 21, 2025
1 parent 45d6452 commit ec3b046
Show file tree
Hide file tree
Showing 66 changed files with 723 additions and 787 deletions.
57 changes: 56 additions & 1 deletion cmd/gpud/command/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ var (

dockerIgnoreConnectionErrors bool
kubeletIgnoreConnectionErrors bool

nvidiaSMICommand string
nvidiaSMIQueryCommand string
ibstatCommand string
infinibandClassDirectory string
)

const (
Expand All @@ -71,7 +76,6 @@ func App() *cli.App {
app.Description = "monitor your GPU/CPU machines and run workloads"

app.Commands = []cli.Command{

{
Name: "login",
Usage: "login gpud to lepton.ai (called automatically in gpud up with non-empty --token)",
Expand Down Expand Up @@ -263,6 +267,32 @@ sudo rm /etc/systemd/system/gpud.service
Usage: "ignore connection errors to kubelet read-only port, useful when kubelet readOnlyPort is disabled (default: false)",
Destination: &kubeletIgnoreConnectionErrors,
},

// only for testing
cli.StringFlag{
Name: "nvidia-smi-command",
Usage: "sets the nvidia-smi command (leave empty for default, useful for testing)",
Destination: &nvidiaSMICommand,
Hidden: true,
},
cli.StringFlag{
Name: "nvidia-smi-query-command",
Usage: "sets the nvidia-smi --query command (leave empty for default, useful for testing)",
Destination: &nvidiaSMIQueryCommand,
Hidden: true,
},
cli.StringFlag{
Name: "ibstat-command",
Usage: "sets the ibstat command (leave empty for default, useful for testing)",
Destination: &ibstatCommand,
Hidden: true,
},
cli.StringFlag{
Name: "infiniband-class-directory",
Usage: "sets the infiniband class directory (leave empty for default, useful for testing)",
Destination: &infinibandClassDirectory,
Hidden: true,
},
},
},

Expand Down Expand Up @@ -542,6 +572,31 @@ cat summary.txt
Usage: "enable dmesg checks (default: true)",
Destination: &dmesgCheck,
},
// only for testing
cli.StringFlag{
Name: "nvidia-smi-command",
Usage: "sets the nvidia-smi command (leave empty for default, useful for testing)",
Destination: &nvidiaSMICommand,
Hidden: true,
},
cli.StringFlag{
Name: "nvidia-smi-query-command",
Usage: "sets the nvidia-smi --query command (leave empty for default, useful for testing)",
Destination: &nvidiaSMIQueryCommand,
Hidden: true,
},
cli.StringFlag{
Name: "ibstat-command",
Usage: "sets the ibstat command (leave empty for default, useful for testing)",
Destination: &ibstatCommand,
Hidden: true,
},
cli.StringFlag{
Name: "infiniband-class-directory",
Usage: "sets the infiniband class directory (leave empty for default, useful for testing)",
Destination: &infinibandClassDirectory,
Hidden: true,
},
},
},
{
Expand Down
4 changes: 4 additions & 0 deletions cmd/gpud/command/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ func cmdScan(cliContext *cli.Context) error {
diagnose.WithNetcheck(netcheck),
diagnose.WithDiskcheck(diskcheck),
diagnose.WithDmesgCheck(dmesgCheck),
diagnose.WithNvidiaSMICommand(nvidiaSMICommand),
diagnose.WithNvidiaSMIQueryCommand(nvidiaSMIQueryCommand),
diagnose.WithIbstatCommand(ibstatCommand),
diagnose.WithInfinibandClassDirectory(infinibandClassDirectory),
}
if zapLvl.Level() <= zap.DebugLevel { // e.g., info, warn, error
diagnoseOpts = append(diagnoseOpts, diagnose.WithDebug(true))
Expand Down
12 changes: 10 additions & 2 deletions components/accelerator/nvidia/bad-envs/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,24 @@ import (

"github.com/leptonai/gpud/components"
bad_envs_id "github.com/leptonai/gpud/components/accelerator/nvidia/bad-envs/id"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, bad_envs_id.Name)

return &component{
Expand Down
12 changes: 10 additions & 2 deletions components/accelerator/nvidia/clock-speed/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/leptonai/gpud/components"
nvidia_clock_speed_id "github.com/leptonai/gpud/components/accelerator/nvidia/clock-speed/id"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_query_metrics_clockspeed "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/clock-speed"
"github.com/leptonai/gpud/components/query"
Expand All @@ -17,11 +18,18 @@ import (
"github.com/prometheus/client_golang/prometheus"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_clock_speed_id.Name)

return &component{
Expand Down
33 changes: 0 additions & 33 deletions components/accelerator/nvidia/clock-speed/config.go

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package badenvs
package common

import (
"database/sql"
Expand All @@ -9,6 +9,15 @@ import (

// Config is the configuration accepted by the NVIDIA accelerator components:
// the query settings plus optional overrides for external tools and paths.
type Config struct {
// Query carries the query settings, serialized under the "query" JSON key.
Query query_config.Config `json:"query"`

// ToolOverwrites is embedded, so its fields are reachable directly on
// Config (e.g. cfg.NvidiaSMICommand).
ToolOverwrites
}

// ToolOverwrites groups optional overrides for the external commands and
// paths used when querying NVIDIA and InfiniBand state. The components hand
// each value to the corresponding nvidia query poller option.
// NOTE(review): an empty value presumably selects the built-in default, as the
// matching CLI flag help text says "leave empty for default" — confirm in the
// query package.
type ToolOverwrites struct {
// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
12 changes: 10 additions & 2 deletions components/accelerator/nvidia/ecc/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_ecc_id "github.com/leptonai/gpud/components/accelerator/nvidia/ecc/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_query_metrics_ecc "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/ecc"
Expand All @@ -17,11 +18,18 @@ import (
"github.com/prometheus/client_golang/prometheus"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_ecc_id.Name)

return &component{
Expand Down
33 changes: 0 additions & 33 deletions components/accelerator/nvidia/ecc/config.go

This file was deleted.

14 changes: 11 additions & 3 deletions components/accelerator/nvidia/error-xid-sxid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/dustin/go-humanize"
"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_error_xid_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error-xid-sxid/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_xid_sxid_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid-sxid-state"
Expand All @@ -18,12 +19,19 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

// this starts the Xid poller via "nvml.StartDefaultInstance"
cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_error_xid_sxid_id.Name)

return &component{
Expand All @@ -37,7 +45,7 @@ func New(ctx context.Context, cfg Config) components.Component {
var _ components.Component = (*component)(nil)

type component struct {
cfg Config
cfg nvidia_common.Config
rootCtx context.Context
cancel context.CancelFunc
poller query.Poller
Expand Down
33 changes: 0 additions & 33 deletions components/accelerator/nvidia/error-xid-sxid/config.go

This file was deleted.

12 changes: 10 additions & 2 deletions components/accelerator/nvidia/error/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,26 @@ import (
"time"

"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
)

const Name = "accelerator-nvidia-error"

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
Expand Down
Loading

0 comments on commit ec3b046

Please sign in to comment.