Skip to content

Commit

Permalink
feat(nvidia): configurable nvidia-smi binary, ibstat binary, infiniba…
Browse files Browse the repository at this point in the history
…nd class dir paths for mock testing

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Jan 20, 2025
1 parent cd89e72 commit e077620
Show file tree
Hide file tree
Showing 60 changed files with 709 additions and 93 deletions.
58 changes: 57 additions & 1 deletion cmd/gpud/command/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ var (

dockerIgnoreConnectionErrors bool
kubeletIgnoreConnectionErrors bool

nvidiaSMICommand string
nvidiaSMIQueryCommand string
ibstatCommand string
infinibandClassDirectory string
)

const (
Expand All @@ -70,7 +75,6 @@ func App() *cli.App {
app.Description = "monitor your GPU/CPU machines and run workloads"

app.Commands = []cli.Command{

{
Name: "login",
Usage: "login gpud to lepton.ai (called automatically in gpud up with non-empty --token)",
Expand Down Expand Up @@ -262,6 +266,32 @@ sudo rm /etc/systemd/system/gpud.service
Usage: "ignore connection errors to kubelet read-only port, useful when kubelet readOnlyPort is disabled (default: false)",
Destination: &kubeletIgnoreConnectionErrors,
},

// only for testing
cli.StringFlag{
Name: "nvidia-smi-command",
Usage: "sets the nvidia-smi command (leave empty for default, useful for testing)",
Destination: &nvidiaSMICommand,
Hidden: true,
},
cli.StringFlag{
Name: "nvidia-smi-query-command",
Usage: "sets the nvidia-smi --query command (leave empty for default, useful for testing)",
Destination: &nvidiaSMIQueryCommand,
Hidden: true,
},
cli.StringFlag{
Name: "ibstat-command",
Usage: "sets the ibstat command (leave empty for default, useful for testing)",
Destination: &ibstatCommand,
Hidden: true,
},
cli.StringFlag{
Name: "infiniband-class-directory",
Usage: "sets the infiniband class directory (leave empty for default, useful for testing)",
Destination: &infinibandClassDirectory,
Hidden: true,
},
},
},

Expand Down Expand Up @@ -536,6 +566,32 @@ cat summary.txt
Usage: "enable disk checks (default: true)",
Destination: &diskcheck,
},

// only for testing
cli.StringFlag{
Name: "nvidia-smi-command",
Usage: "sets the nvidia-smi command (leave empty for default, useful for testing)",
Destination: &nvidiaSMICommand,
Hidden: true,
},
cli.StringFlag{
Name: "nvidia-smi-query-command",
Usage: "sets the nvidia-smi --query command (leave empty for default, useful for testing)",
Destination: &nvidiaSMIQueryCommand,
Hidden: true,
},
cli.StringFlag{
Name: "ibstat-command",
Usage: "sets the ibstat command (leave empty for default, useful for testing)",
Destination: &ibstatCommand,
Hidden: true,
},
cli.StringFlag{
Name: "infiniband-class-directory",
Usage: "sets the infiniband class directory (leave empty for default, useful for testing)",
Destination: &infinibandClassDirectory,
Hidden: true,
},
},
},
{
Expand Down
4 changes: 4 additions & 0 deletions cmd/gpud/command/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ func cmdScan(cliContext *cli.Context) error {
diagnose.WithPollGPMEvents(pollGPMEvents),
diagnose.WithNetcheck(netcheck),
diagnose.WithDiskcheck(diskcheck),
diagnose.WithNvidiaSMICommand(nvidiaSMICommand),
diagnose.WithNvidiaSMIQueryCommand(nvidiaSMIQueryCommand),
diagnose.WithIbstatCommand(ibstatCommand),
diagnose.WithInfinibandClassDirectory(infinibandClassDirectory),
}
if zapLvl.Level() <= zap.DebugLevel { // e.g., info, warn, error
diagnoseOpts = append(diagnoseOpts, diagnose.WithDebug(true))
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/bad-envs/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, bad_envs_id.Name)

return &component{
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/bad-envs/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia bad-envs component.
type Config struct {
// Query is the shared query configuration. New fills in its defaults,
// reads the DBRW/DBRO handles from Query.State, and passes it to the
// default nvidia_query poller.
Query query_config.Config `json:"query"`

// The fields below are forwarded by New to the matching nvidia_query.With*
// option when the default poller is set up.
// NOTE(review): presumably an empty value keeps the built-in default, as the
// hidden CLI flags of the same names describe — confirm in nvidia_query.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/clock-speed/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_clock_speed_id.Name)

return &component{
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/clock-speed/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia clock-speed component.
type Config struct {
// Query is the shared query configuration. New fills in its defaults,
// reads the DBRW/DBRO handles from Query.State, and passes it to the
// default nvidia_query poller.
Query query_config.Config `json:"query"`

// The fields below are forwarded by New to the matching nvidia_query.With*
// option when the default poller is set up.
// NOTE(review): presumably an empty value keeps the built-in default, as the
// hidden CLI flags of the same names describe — confirm in nvidia_query.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/ecc/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_ecc_id.Name)

return &component{
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/ecc/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia ecc component.
type Config struct {
// Query is the shared query configuration. New fills in its defaults,
// reads the DBRW/DBRO handles from Query.State, and passes it to the
// default nvidia_query poller.
Query query_config.Config `json:"query"`

// The fields below are forwarded by New to the matching nvidia_query.With*
// option when the default poller is set up.
// NOTE(review): presumably an empty value keeps the built-in default, as the
// hidden CLI flags of the same names describe — confirm in nvidia_query.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/error-xid-sxid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,14 @@ func New(ctx context.Context, cfg Config) components.Component {

// this starts the Xid poller via "nvml.StartDefaultInstance"
cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_error_xid_sxid_id.Name)

return &component{
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/error-xid-sxid/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia error-xid-sxid component.
type Config struct {
// Query is the shared query configuration. New reads the DBRW/DBRO handles
// from Query.State and passes Query to the default nvidia_query poller.
Query query_config.Config `json:"query"`

// The fields below are forwarded by New to the matching nvidia_query.With*
// option when the default poller is set up.
// NOTE(review): presumably an empty value keeps the built-in default, as the
// hidden CLI flags of the same names describe — confirm in nvidia_query.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/error/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/error/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia error component.
type Config struct {
// Query is the shared query configuration. New fills in its defaults,
// reads the DBRW/DBRO handles from Query.State, and passes it to the
// default nvidia_query poller.
Query query_config.Config `json:"query"`

// The fields below are forwarded by New to the matching nvidia_query.With*
// option when the default poller is set up.
// NOTE(review): presumably an empty value keeps the built-in default, as the
// hidden CLI flags of the same names describe — confirm in nvidia_query.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/error/xid/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia error/xid component.
type Config struct {
// Query is the shared query configuration for this component.
Query query_config.Config `json:"query"`

// NOTE(review): the component wiring for this package is not visible here;
// sibling nvidia components forward the fields below to the matching
// nvidia_query.With* poller option — confirm the same happens for xid, and
// that an empty value keeps the built-in default.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/fabric-manager/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@ func New(ctx context.Context, cfg Config) (components.Component, error) {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Log.Query.State.DBRW, cfg.Log.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Log.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Log.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

if err := cfg.Log.Validate(); err != nil {
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/fabric-manager/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ import (
// Config holds the settings for the nvidia fabric-manager component.
type Config struct {
// Query is the shared query configuration. New fills in its defaults and
// passes it to the default nvidia_query poller.
Query query_config.Config `json:"query"`
// Log is the log query configuration. New validates it and takes the
// poller's DBRW/DBRO handles from Log.Query.State (not from Query.State).
Log query_log_config.Config `json:"log"`

// The fields below are forwarded by New to the matching nvidia_query.With*
// option when the default poller is set up.
// NOTE(review): presumably an empty value keeps the built-in default, as the
// hidden CLI flags of the same names describe — confirm in nvidia_query.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/gpm/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia gpm component.
type Config struct {
// Query is the shared query configuration for this component.
Query query_config.Config `json:"query"`

// NOTE(review): the component wiring for this package is not visible here;
// sibling nvidia components forward the fields below to the matching
// nvidia_query.With* poller option — confirm the same happens for gpm, and
// that an empty value keeps the built-in default.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/gsp-firmware-mode/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_gsp_firmware_mode_id.Name)

return &component{
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/gsp-firmware-mode/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia gsp-firmware-mode component.
type Config struct {
// Query is the shared query configuration. New fills in its defaults,
// reads the DBRW/DBRO handles from Query.State, and passes it to the
// default nvidia_query poller.
Query query_config.Config `json:"query"`

// The fields below are forwarded by New to the matching nvidia_query.With*
// option when the default poller is set up.
// NOTE(review): presumably an empty value keeps the built-in default, as the
// hidden CLI flags of the same names describe — confirm in nvidia_query.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/hw-slowdown/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,14 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_hw_slowdown_id.Name)

return &component{
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/hw-slowdown/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ import (

// Config holds the settings for the nvidia hw-slowdown component.
type Config struct {
// Query is the shared query configuration. New fills in its defaults,
// reads the DBRW/DBRO handles from Query.State, and passes it to the
// default nvidia_query poller.
Query query_config.Config `json:"query"`

// The fields below are forwarded by New to the matching nvidia_query.With*
// option when the default poller is set up.
// NOTE(review): presumably an empty value keeps the built-in default, as the
// hidden CLI flags of the same names describe — confirm in nvidia_query.

// NvidiaSMICommand overrides the nvidia-smi command.
NvidiaSMICommand string `json:"nvidia_smi_command"`
// NvidiaSMIQueryCommand overrides the nvidia-smi --query command.
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
// IbstatCommand overrides the ibstat command.
IbstatCommand string `json:"ibstat_command"`
// InfinibandClassDirectory overrides the infiniband class directory.
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/infiniband/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_infiniband_id.Name)

return &component{
Expand Down
5 changes: 5 additions & 0 deletions components/accelerator/nvidia/infiniband/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ type Config struct {
Query query_config.Config `json:"query"`

ExpectedPortStates

NvidiaSMICommand string `json:"nvidia_smi_command"`
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
IbstatCommand string `json:"ibstat_command"`
InfinibandClassDirectory string `json:"infiniband_class_directory"`
}

// Configures the expected state of the ports.
Expand Down
9 changes: 8 additions & 1 deletion components/accelerator/nvidia/info/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
nvidia_query.SetDefaultPoller(
nvidia_query.WithDBRW(cfg.Query.State.DBRW),
nvidia_query.WithDBRO(cfg.Query.State.DBRO),
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand),
nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand),
nvidia_query.WithIbstatCommand(cfg.IbstatCommand),
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory),
)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
Expand Down
Loading

0 comments on commit e077620

Please sign in to comment.