From 6da0c79e7c10b0e7571cea6311dae76290cc8b26 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Mon, 20 Jan 2025 22:11:28 +0800 Subject: [PATCH] feat(nvidia): configurable nvidia-smi binary, ibstat binary, infiniband class dir paths for mock testing Signed-off-by: Gyuho Lee --- cmd/gpud/command/command.go | 58 ++++++++++++++- cmd/gpud/command/scan.go | 4 + .../accelerator/nvidia/bad-envs/component.go | 9 ++- .../accelerator/nvidia/bad-envs/config.go | 5 ++ .../nvidia/clock-speed/component.go | 9 ++- .../accelerator/nvidia/clock-speed/config.go | 5 ++ .../accelerator/nvidia/ecc/component.go | 9 ++- components/accelerator/nvidia/ecc/config.go | 5 ++ .../nvidia/error-xid-sxid/component.go | 9 ++- .../nvidia/error-xid-sxid/config.go | 5 ++ .../accelerator/nvidia/error/component.go | 9 ++- components/accelerator/nvidia/error/config.go | 5 ++ .../accelerator/nvidia/error/xid/config.go | 5 ++ .../nvidia/fabric-manager/component.go | 9 ++- .../nvidia/fabric-manager/config.go | 5 ++ components/accelerator/nvidia/gpm/config.go | 5 ++ .../nvidia/gsp-firmware-mode/component.go | 9 ++- .../nvidia/gsp-firmware-mode/config.go | 5 ++ .../nvidia/hw-slowdown/component.go | 9 ++- .../accelerator/nvidia/hw-slowdown/config.go | 5 ++ .../nvidia/infiniband/component.go | 9 ++- .../accelerator/nvidia/infiniband/config.go | 5 ++ .../accelerator/nvidia/info/component.go | 9 ++- components/accelerator/nvidia/info/config.go | 5 ++ .../accelerator/nvidia/memory/component.go | 9 ++- .../accelerator/nvidia/memory/config.go | 5 ++ .../accelerator/nvidia/nccl/component.go | 9 ++- components/accelerator/nvidia/nccl/config.go | 5 ++ .../accelerator/nvidia/nvlink/component.go | 9 ++- .../accelerator/nvidia/nvlink/config.go | 5 ++ .../accelerator/nvidia/peermem/component.go | 9 ++- .../accelerator/nvidia/peermem/config.go | 5 ++ .../nvidia/persistence-mode/component.go | 9 ++- .../nvidia/persistence-mode/config.go | 5 ++ .../accelerator/nvidia/power/component.go | 9 ++- components/accelerator/nvidia/power/config.go | 5 ++ .../accelerator/nvidia/processes/component.go | 9 ++- .../accelerator/nvidia/processes/config.go | 5 ++ .../nvidia/query/infiniband/ibstat.go | 41 +++++++++-- .../nvidia/query/infiniband/ibstat_test.go | 27 +++++++ .../nvidia/query/infiniband/infiniband.go | 22 +----- .../query/infiniband/infiniband_test.go | 4 +- .../nvidia/query/nvidia_smi_query.go | 42 ++++++----- .../nvidia/query/nvidia_smi_query_test.go | 32 ++++++++ .../accelerator/nvidia/query/options.go | 73 +++++++++++++++++++ .../nvidia/query/peermem/peermem.go | 7 +- components/accelerator/nvidia/query/query.go | 49 +++++++++---- .../nvidia-smi.525.125.06.out.0.valid | 11 +++ .../testdata/nvidia-smi.550.90.07.out.0.valid | 48 ++++++++++++ .../nvidia/remapped-rows/component.go | 9 ++- .../nvidia/remapped-rows/config.go | 5 ++ .../nvidia/temperature/component.go | 9 ++- .../accelerator/nvidia/temperature/config.go | 5 ++ .../nvidia/utilization/component.go | 9 ++- .../accelerator/nvidia/utilization/config.go | 5 ++ components/diagnose/diagnose.go | 6 +- components/diagnose/options.go | 45 ++++++++++++ components/diagnose/scan.go | 9 ++- config/config.go | 10 +++ config/default.go | 7 ++ config/op_options.go | 31 ++++++++ 61 files changed, 723 insertions(+), 93 deletions(-) create mode 100644 components/accelerator/nvidia/query/options.go create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi.525.125.06.out.0.valid create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi.550.90.07.out.0.valid diff --git a/cmd/gpud/command/command.go b/cmd/gpud/command/command.go index a493e06d..eb5a6383 100644 --- a/cmd/gpud/command/command.go +++ b/cmd/gpud/command/command.go @@ -53,6 +53,11 @@ var ( dockerIgnoreConnectionErrors bool kubeletIgnoreConnectionErrors bool + + nvidiaSMICommand string + nvidiaSMIQueryCommand string + ibstatCommand string + infinibandClassDirectory string ) const ( @@ -70,7 +75,6 @@ func App() *cli.App { app.Description = "monitor your GPU/CPU machines and run workloads" app.Commands = []cli.Command{ - { Name: "login", Usage: "login gpud to lepton.ai (called automatically in gpud up with non-empty --token)", @@ -262,6 +266,32 @@ sudo rm /etc/systemd/system/gpud.service Usage: "ignore connection errors to kubelet read-only port, useful when kubelet readOnlyPort is disabled (default: false)", Destination: &kubeletIgnoreConnectionErrors, }, + + // only for testing + cli.StringFlag{ + Name: "nvidia-smi-command", + Usage: "sets the nvidia-smi command (leave empty for default, useful for testing)", + Destination: &nvidiaSMICommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "nvidia-smi-query-command", + Usage: "sets the nvidia-smi --query command (leave empty for default, useful for testing)", + Destination: &nvidiaSMIQueryCommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "ibstat-command", + Usage: "sets the ibstat command (leave empty for default, useful for testing)", + Destination: &ibstatCommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "infiniband-class-directory", + Usage: "sets the infiniband class directory (leave empty for default, useful for testing)", + Destination: &infinibandClassDirectory, + Hidden: true, + }, }, }, @@ -536,6 +566,32 @@ cat summary.txt Usage: "enable disk checks (default: true)", Destination: &diskcheck, }, + + // only for testing + cli.StringFlag{ + Name: "nvidia-smi-command", + Usage: "sets the nvidia-smi command (leave empty for default, useful for testing)", + Destination: &nvidiaSMICommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "nvidia-smi-query-command", + Usage: "sets the nvidia-smi --query command (leave empty for default, useful for testing)", + Destination: &nvidiaSMIQueryCommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "ibstat-command", + Usage: "sets the ibstat command (leave empty for default, useful for testing)", + Destination: &ibstatCommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "infiniband-class-directory", + Usage: "sets the infiniband class directory (leave empty for default, useful for testing)", + Destination: &infinibandClassDirectory, + Hidden: true, + }, }, }, { diff --git a/cmd/gpud/command/scan.go b/cmd/gpud/command/scan.go index 94b09d06..9287bc41 100644 --- a/cmd/gpud/command/scan.go +++ b/cmd/gpud/command/scan.go @@ -30,6 +30,10 @@ func cmdScan(cliContext *cli.Context) error { diagnose.WithPollGPMEvents(pollGPMEvents), diagnose.WithNetcheck(netcheck), diagnose.WithDiskcheck(diskcheck), + diagnose.WithNvidiaSMICommand(nvidiaSMICommand), + diagnose.WithNvidiaSMIQueryCommand(nvidiaSMIQueryCommand), + diagnose.WithIbstatCommand(ibstatCommand), + diagnose.WithInfinibandClassDirectory(infinibandClassDirectory), } if zapLvl.Level() <= zap.DebugLevel { // e.g., info, warn, error diagnoseOpts = append(diagnoseOpts, diagnose.WithDebug(true)) diff --git a/components/accelerator/nvidia/bad-envs/component.go b/components/accelerator/nvidia/bad-envs/component.go index 96f2c0d7..12cf5444 100644 --- a/components/accelerator/nvidia/bad-envs/component.go +++ b/components/accelerator/nvidia/bad-envs/component.go @@ -17,7 +17,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, bad_envs_id.Name) return &component{ diff --git a/components/accelerator/nvidia/bad-envs/config.go b/components/accelerator/nvidia/bad-envs/config.go index d8398fab..725989d7 100644 --- a/components/accelerator/nvidia/bad-envs/config.go +++ b/components/accelerator/nvidia/bad-envs/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/clock-speed/component.go b/components/accelerator/nvidia/clock-speed/component.go index 8c77185d..e90df11c 100644 --- a/components/accelerator/nvidia/clock-speed/component.go +++ b/components/accelerator/nvidia/clock-speed/component.go @@ -21,7 +21,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_clock_speed_id.Name) return &component{ diff --git a/components/accelerator/nvidia/clock-speed/config.go b/components/accelerator/nvidia/clock-speed/config.go index b985096b..f9f1192a 100644 --- a/components/accelerator/nvidia/clock-speed/config.go +++ b/components/accelerator/nvidia/clock-speed/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/ecc/component.go b/components/accelerator/nvidia/ecc/component.go index 9b6340d9..7224ff2a 100644 --- a/components/accelerator/nvidia/ecc/component.go +++ b/components/accelerator/nvidia/ecc/component.go @@ -21,7 +21,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_ecc_id.Name) return &component{ diff --git a/components/accelerator/nvidia/ecc/config.go b/components/accelerator/nvidia/ecc/config.go index 4080e02b..5f6b0072 100644 --- a/components/accelerator/nvidia/ecc/config.go +++ b/components/accelerator/nvidia/ecc/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/error-xid-sxid/component.go b/components/accelerator/nvidia/error-xid-sxid/component.go index f40e0444..d137aa40 100644 --- a/components/accelerator/nvidia/error-xid-sxid/component.go +++ b/components/accelerator/nvidia/error-xid-sxid/component.go @@ -23,7 +23,14 @@ func New(ctx context.Context, cfg Config) components.Component { // this starts the Xid poller via "nvml.StartDefaultInstance" cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_error_xid_sxid_id.Name) return &component{ diff --git a/components/accelerator/nvidia/error-xid-sxid/config.go b/components/accelerator/nvidia/error-xid-sxid/config.go index 3fe6dabd..2e45bbad 100644 --- a/components/accelerator/nvidia/error-xid-sxid/config.go +++ b/components/accelerator/nvidia/error-xid-sxid/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/error/component.go b/components/accelerator/nvidia/error/component.go index cb29196d..6866abef 100644 --- a/components/accelerator/nvidia/error/component.go +++ b/components/accelerator/nvidia/error/component.go @@ -18,7 +18,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/error/config.go b/components/accelerator/nvidia/error/config.go index 755e7cd8..a6c9c1a8 100644 --- a/components/accelerator/nvidia/error/config.go +++ b/components/accelerator/nvidia/error/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/error/xid/config.go b/components/accelerator/nvidia/error/xid/config.go index 1587e2a7..1ea39cdb 100644 --- a/components/accelerator/nvidia/error/xid/config.go +++ b/components/accelerator/nvidia/error/xid/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/fabric-manager/component.go b/components/accelerator/nvidia/fabric-manager/component.go index 0ace8b58..8c9dcbc8 100644 --- a/components/accelerator/nvidia/fabric-manager/component.go +++ b/components/accelerator/nvidia/fabric-manager/component.go @@ -21,7 +21,14 @@ func New(ctx context.Context, cfg Config) (components.Component, error) { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Log.Query.State.DBRW, cfg.Log.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Log.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Log.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) if err := cfg.Log.Validate(); err != nil { diff --git a/components/accelerator/nvidia/fabric-manager/config.go b/components/accelerator/nvidia/fabric-manager/config.go index a6ba3328..9b5ecdb0 100644 --- a/components/accelerator/nvidia/fabric-manager/config.go +++ b/components/accelerator/nvidia/fabric-manager/config.go @@ -15,6 +15,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` Log query_log_config.Config `json:"log"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/gpm/config.go b/components/accelerator/nvidia/gpm/config.go index 88faa6bc..f80647f6 100644 --- a/components/accelerator/nvidia/gpm/config.go +++ b/components/accelerator/nvidia/gpm/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/gsp-firmware-mode/component.go b/components/accelerator/nvidia/gsp-firmware-mode/component.go index 0b41d495..0a1f2915 100644 --- a/components/accelerator/nvidia/gsp-firmware-mode/component.go +++ b/components/accelerator/nvidia/gsp-firmware-mode/component.go @@ -17,7 +17,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_gsp_firmware_mode_id.Name) return &component{ diff --git a/components/accelerator/nvidia/gsp-firmware-mode/config.go b/components/accelerator/nvidia/gsp-firmware-mode/config.go index b2a61c8b..461794d6 100644 --- a/components/accelerator/nvidia/gsp-firmware-mode/config.go +++ b/components/accelerator/nvidia/gsp-firmware-mode/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/hw-slowdown/component.go b/components/accelerator/nvidia/hw-slowdown/component.go index a131ea2f..a52df785 100644 --- a/components/accelerator/nvidia/hw-slowdown/component.go +++ b/components/accelerator/nvidia/hw-slowdown/component.go @@ -24,7 +24,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_hw_slowdown_id.Name) return &component{ diff --git a/components/accelerator/nvidia/hw-slowdown/config.go b/components/accelerator/nvidia/hw-slowdown/config.go index 522632af..67ac181b 100644 --- a/components/accelerator/nvidia/hw-slowdown/config.go +++ b/components/accelerator/nvidia/hw-slowdown/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/infiniband/component.go b/components/accelerator/nvidia/infiniband/component.go index 842407f7..fe4066db 100644 --- a/components/accelerator/nvidia/infiniband/component.go +++ b/components/accelerator/nvidia/infiniband/component.go @@ -18,7 +18,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_infiniband_id.Name) return &component{ diff --git a/components/accelerator/nvidia/infiniband/config.go b/components/accelerator/nvidia/infiniband/config.go index d342edb7..66472d36 100644 --- a/components/accelerator/nvidia/infiniband/config.go +++ b/components/accelerator/nvidia/infiniband/config.go @@ -11,6 +11,11 @@ type Config struct { Query query_config.Config `json:"query"` ExpectedPortStates + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } // Configures the expected state of the ports. diff --git a/components/accelerator/nvidia/info/component.go b/components/accelerator/nvidia/info/component.go index 255b2545..aaea61a2 100644 --- a/components/accelerator/nvidia/info/component.go +++ b/components/accelerator/nvidia/info/component.go @@ -18,7 +18,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/info/config.go b/components/accelerator/nvidia/info/config.go index 28ec8612..54e920a8 100644 --- a/components/accelerator/nvidia/info/config.go +++ b/components/accelerator/nvidia/info/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/memory/component.go b/components/accelerator/nvidia/memory/component.go index ae17686f..340f2c7e 100644 --- a/components/accelerator/nvidia/memory/component.go +++ b/components/accelerator/nvidia/memory/component.go @@ -22,7 +22,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/memory/config.go b/components/accelerator/nvidia/memory/config.go index 31be6df5..50abcd9d 100644 --- a/components/accelerator/nvidia/memory/config.go +++ b/components/accelerator/nvidia/memory/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/nccl/component.go b/components/accelerator/nvidia/nccl/component.go index cca3a5d6..9fbc5d2d 100644 --- a/components/accelerator/nvidia/nccl/component.go +++ b/components/accelerator/nvidia/nccl/component.go @@ -20,7 +20,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_nccl_id.Name) return &component{ diff --git a/components/accelerator/nvidia/nccl/config.go b/components/accelerator/nvidia/nccl/config.go index 3e9c02ba..d37c8304 100644 --- a/components/accelerator/nvidia/nccl/config.go +++ b/components/accelerator/nvidia/nccl/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/nvlink/component.go b/components/accelerator/nvidia/nvlink/component.go index b466377b..09cd3bd2 100644 --- a/components/accelerator/nvidia/nvlink/component.go +++ b/components/accelerator/nvidia/nvlink/component.go @@ -22,7 +22,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/nvlink/config.go b/components/accelerator/nvidia/nvlink/config.go index f8804eb3..d373f0ae 100644 --- a/components/accelerator/nvidia/nvlink/config.go +++ b/components/accelerator/nvidia/nvlink/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/peermem/component.go b/components/accelerator/nvidia/peermem/component.go index 969e3b1d..74dec9b5 100644 --- a/components/accelerator/nvidia/peermem/component.go +++ b/components/accelerator/nvidia/peermem/component.go @@ -20,7 +20,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_peermem_id.Name) return &component{ diff --git a/components/accelerator/nvidia/peermem/config.go b/components/accelerator/nvidia/peermem/config.go index 1c394ab0..2c73fb96 100644 --- a/components/accelerator/nvidia/peermem/config.go +++ b/components/accelerator/nvidia/peermem/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/persistence-mode/component.go b/components/accelerator/nvidia/persistence-mode/component.go index 67687588..a4e7452b 100644 --- a/components/accelerator/nvidia/persistence-mode/component.go +++ b/components/accelerator/nvidia/persistence-mode/component.go @@ -17,7 +17,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_persistence_mode_id.Name) return &component{ diff --git a/components/accelerator/nvidia/persistence-mode/config.go b/components/accelerator/nvidia/persistence-mode/config.go index 70ce1f8f..e6086655 100644 --- a/components/accelerator/nvidia/persistence-mode/config.go +++ b/components/accelerator/nvidia/persistence-mode/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/power/component.go b/components/accelerator/nvidia/power/component.go index e5708b26..18108b26 100644 --- a/components/accelerator/nvidia/power/component.go +++ b/components/accelerator/nvidia/power/component.go @@ -21,7 +21,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_power_id.Name) return &component{ diff --git a/components/accelerator/nvidia/power/config.go b/components/accelerator/nvidia/power/config.go index 0d7ae6a7..f243e187 100644 --- a/components/accelerator/nvidia/power/config.go +++ b/components/accelerator/nvidia/power/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/processes/component.go b/components/accelerator/nvidia/processes/component.go index a9d9c09f..727717c9 100644 --- a/components/accelerator/nvidia/processes/component.go +++ b/components/accelerator/nvidia/processes/component.go @@ -22,7 +22,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/processes/config.go b/components/accelerator/nvidia/processes/config.go index a2fae1a3..1957066f 100644 --- a/components/accelerator/nvidia/processes/config.go +++ b/components/accelerator/nvidia/processes/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/query/infiniband/ibstat.go b/components/accelerator/nvidia/query/infiniband/ibstat.go index 41755e2a..89bdff6c 100644 --- a/components/accelerator/nvidia/query/infiniband/ibstat.go +++ b/components/accelerator/nvidia/query/infiniband/ibstat.go @@ -5,24 +5,51 @@ import ( "context" "errors" "fmt" - "os/exec" "strings" "github.com/leptonai/gpud/log" + "github.com/leptonai/gpud/pkg/process" "sigs.k8s.io/yaml" ) -func RunIbstat(ctx context.Context) (*IbstatOutput, error) { - p, err := exec.LookPath("ibstat") - if err != nil { - return nil, fmt.Errorf("ibstat not found (%w)", err) +func GetIbstatOutput(ctx context.Context, ibstatCommands []string) (*IbstatOutput, error) { + if len(ibstatCommands) == 0 { + ibstatCommands = []string{"ibstat"} } - b, err := exec.CommandContext(ctx, p).CombinedOutput() + + p, err := process.New( + process.WithCommand(ibstatCommands...), + process.WithRunAsBashScript(), + ) if err != nil { return nil, err } + + if err := p.Start(ctx); err != nil { + return nil, err + } + defer func() { + if err := p.Close(ctx); err != nil { + log.Logger.Warnw("failed to abort command", "err", err) + } + }() + + lines := make([]string, 0) + if err := process.Read( + ctx, + p, + process.WithReadStdout(), + process.WithReadStderr(), + process.WithProcessLine(func(line string) { + lines = append(lines, line) + }), + process.WithWaitForCmd(), + ); err != nil { + return nil, fmt.Errorf("failed to read ibstat output: %w\n\noutput:\n%s", err, strings.Join(lines, "\n")) + } + o := &IbstatOutput{ - Raw: string(b), + Raw: strings.Join(lines, "\n"), } // TODO: once stable return error diff --git a/components/accelerator/nvidia/query/infiniband/ibstat_test.go b/components/accelerator/nvidia/query/infiniband/ibstat_test.go index 39f47e3d..33eb2bb7 100644 --- a/components/accelerator/nvidia/query/infiniband/ibstat_test.go +++ b/components/accelerator/nvidia/query/infiniband/ibstat_test.go @@ -1,13 +1,40 @@ package infiniband import ( + "context" "errors" "os" "path/filepath" "reflect" "testing" + "time" ) +func TestGetIbstatOutput(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // iterate all files in "testdata/" + matches, err := filepath.Glob("testdata/ibstat.*") + if err != nil { + t.Fatalf("failed to glob: %v", err) + } + + for _, queryFile := range matches { + o, err := GetIbstatOutput( + ctx, + []string{"cat", queryFile}, + ) + if err != nil { + t.Fatal(err) + } + + o.Raw = "" + + t.Logf("%q:\n%+v", queryFile, o) + } +} + func TestParseIBStat(t *testing.T) { input := `CA 'mlx5_0' CA type: MT4129 diff --git a/components/accelerator/nvidia/query/infiniband/infiniband.go b/components/accelerator/nvidia/query/infiniband/infiniband.go index 46d8a8eb..4469232c 100644 --- a/components/accelerator/nvidia/query/infiniband/infiniband.go +++ b/components/accelerator/nvidia/query/infiniband/infiniband.go @@ -5,7 +5,6 @@ import ( "context" "fmt" "os" - "os/exec" "strings" "github.com/leptonai/gpud/log" @@ -34,14 +33,6 @@ func SupportsInfinibandPortRate(gpuProductName string) int { return 0 } -func IbstatExists() bool { - p, err := exec.LookPath("ibstat") - if err != nil { - return false - } - return p != "" -} - // lspci | grep -i infiniband // 1a:00.0 Infiniband controller: Mellanox Technologies MT2910 Family [ConnectX-7] // 3c:00.0 Infiniband controller: Mellanox Technologies MT2910 Family [ConnectX-7] @@ -106,18 +97,11 @@ func CountInfinibandPCIBuses(ctx context.Context) (int, error) { // Counts the directories in "/sys/class/infiniband". // Returns 0 if the directory does not exist. func CountInfinibandClass() int { - info, err := os.Stat("/sys/class/infiniband") - if err != nil || !info.IsDir() { - return 0 - } - dirs, err := os.ReadDir("/sys/class/infiniband") - if err != nil { - return 0 - } - return len(dirs) + return CountInfinibandClassBySubDir("/sys/class/infiniband") } -func countInfinibandClass(dir string) int { +// Count the sub-directories under the specified directory. +func CountInfinibandClassBySubDir(dir string) int { info, err := os.Stat(dir) if err != nil || !info.IsDir() { return 0 diff --git a/components/accelerator/nvidia/query/infiniband/infiniband_test.go b/components/accelerator/nvidia/query/infiniband/infiniband_test.go index a8bc8324..f54337d3 100644 --- a/components/accelerator/nvidia/query/infiniband/infiniband_test.go +++ b/components/accelerator/nvidia/query/infiniband/infiniband_test.go @@ -5,7 +5,7 @@ import ( "testing" ) -func TestCountInfinibandClass(t *testing.T) { +func TestCountInfinibandClassBySubDir(t *testing.T) { t.Parallel() tests := []struct { @@ -58,7 +58,7 @@ func TestCountInfinibandClass(t *testing.T) { } } - got := countInfinibandClass(tt.dirPath) + got := CountInfinibandClassBySubDir(tt.dirPath) if got != tt.want { t.Errorf("countInfinibandClass() = %v, want %v", got, tt.want) } diff --git a/components/accelerator/nvidia/query/nvidia_smi_query.go b/components/accelerator/nvidia/query/nvidia_smi_query.go index e118652a..7c415089 100644 --- a/components/accelerator/nvidia/query/nvidia_smi_query.go +++ b/components/accelerator/nvidia/query/nvidia_smi_query.go @@ -9,6 +9,7 @@ import ( "fmt" "sort" "strings" + "sync" metrics_clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state" "github.com/leptonai/gpud/log" @@ -25,22 +26,16 @@ func SMIExists() bool { return err == nil } -func RunSMI(ctx context.Context, args ...string) ([]byte, error) { - log.Logger.Debugw("finding nvidia-smi") - nvidiaSMIPath, err := file.LocateExecutable("nvidia-smi") - if err != nil { - return nil, fmt.Errorf("nvidia-smi not found (%w)", err) - } - +func RunSMI(ctx context.Context, commandArgs []string) ([]byte, error) { p, err := process.New( - process.WithCommand(append([]string{nvidiaSMIPath}, args...)...), + process.WithCommand(commandArgs...), process.WithRunAsBashScript(), ) if err != nil { return nil, err } - log.Logger.Debugw("starting nvidia-smi", "args", args) + log.Logger.Debugw("starting nvidia-smi", "args", commandArgs) if err := p.Start(ctx); err != nil { return nil, err } @@ -71,6 +66,8 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) { // [Sat Oct 12 18:38:44 2024] _nv042330rm+0x10/0x40 [nvidia] // [Sat Oct 12 18:38:44 2024] ? _nv043429rm+0x23c/0x290 errc := make(chan error, 1) + + mu := sync.Mutex{} lines := make([]string, 0) go func() { err := process.Read( @@ -79,35 +76,40 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) { process.WithReadStdout(), process.WithReadStderr(), process.WithProcessLine(func(line string) { + mu.Lock() lines = append(lines, line) + mu.Unlock() }), process.WithWaitForCmd(), ) errc <- err }() - partialOutputErr := "" - if len(lines) > 0 { - partialOutputErr = fmt.Sprintf("\n\n(partial) output:\n%s", strings.Join(lines, "\n")) - } - select { case <-ctx.Done(): - return nil, fmt.Errorf("nvidia-smi command timed out: %w%s", ctx.Err(), partialOutputErr) + mu.Lock() + lineOutput := strings.Join(lines, "\n") + mu.Unlock() + + return nil, fmt.Errorf("nvidia-smi command timed out: %w\n\n(partial) output:\n%s", ctx.Err(), lineOutput) case err := <-errc: + mu.Lock() + lineOutput := strings.Join(lines, "\n") + mu.Unlock() + if err != nil { - return nil, fmt.Errorf("nvidia-smi command failed: %w%s", err, partialOutputErr) + return nil, fmt.Errorf("nvidia-smi command timed out: %w\n\n(partial) output:\n%s", err, lineOutput) } - return []byte(strings.Join(lines, "\n")), nil + return []byte(lineOutput), nil } } // Make sure to call this with a timeout, as a broken GPU may block the command. // e.g., // nvAssertOkFailedNoLog: Assertion failed: Call timed out [NV_ERR_TIMEOUT] (0x00000065) returned from pRmApi->Control(pRmApi, RES_GET_CLIENT_HANDLE(pKernelChannel), RES_GET_HANDLE(pKernelChannel), -func GetSMIOutput(ctx context.Context) (*SMIOutput, error) { - qb, err := RunSMI(ctx, "--query") +func GetSMIOutput(ctx context.Context, smiCmds []string, smiQueryCmds []string) (*SMIOutput, error) { + qb, err := RunSMI(ctx, smiQueryCmds) if err != nil { return nil, err } @@ -117,7 +119,7 @@ func GetSMIOutput(ctx context.Context) (*SMIOutput, error) { return nil, err } - sb, err := RunSMI(ctx) + sb, err := RunSMI(ctx, smiCmds) if err != nil { if IsErrDeviceHandleUnknownError(err) { o.SummaryFailure = err diff --git a/components/accelerator/nvidia/query/nvidia_smi_query_test.go b/components/accelerator/nvidia/query/nvidia_smi_query_test.go index cb6c50c3..beb8c27e 100644 --- a/components/accelerator/nvidia/query/nvidia_smi_query_test.go +++ b/components/accelerator/nvidia/query/nvidia_smi_query_test.go @@ -1,12 +1,44 @@ package query import ( + "context" "os" "path/filepath" "reflect" "testing" + "time" ) +func TestGetSMIOutput(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // iterate all files in "testdata/" + matches, err := filepath.Glob("testdata/nvidia-smi-query.*.out.*.valid") + if err != nil { + t.Fatalf("failed to glob: %v", err) + } + + for _, queryFile := range matches { + o, err := GetSMIOutput( + ctx, + []string{"cat", "testdata/nvidia-smi.550.90.07.out.0.valid"}, + []string{"cat", queryFile}, + ) + if err != nil { + // TODO: fix + // CI can be flaky due to "cat" output being different + t.Logf("%q: %v", queryFile, err) + continue + } + + o.Raw = "" + o.Summary = "" + + t.Logf("%q:\n%+v", queryFile, o) + } +} + func TestParse4090Valid(t *testing.T) { data, err := os.ReadFile("testdata/nvidia-smi-query.535.154.05.out.0.valid.4090") if err != nil { diff --git a/components/accelerator/nvidia/query/options.go b/components/accelerator/nvidia/query/options.go new file mode 100644 index 00000000..9458bdfa --- /dev/null +++ b/components/accelerator/nvidia/query/options.go @@ -0,0 +1,73 @@ +package query + +import "database/sql" + +type Op struct { + dbRW *sql.DB + dbRO *sql.DB + nvidiaSMICommand string + nvidiaSMIQueryCommand string + ibstatCommand string + infinibandClassDirectory string +} + +type OpOption func(*Op) + +func (op *Op) applyOpts(opts []OpOption) error { + for _, opt := range opts { + opt(op) + } + + if op.nvidiaSMICommand == "" { + op.nvidiaSMICommand = "nvidia-smi" + } + if op.nvidiaSMIQueryCommand == "" { + op.nvidiaSMIQueryCommand = "nvidia-smi --query" + } + if op.ibstatCommand == "" { + op.ibstatCommand = "ibstat" + } + if op.infinibandClassDirectory == "" { + op.infinibandClassDirectory = "/sys/class/infiniband" + } + + return nil +} + +func WithDBRW(db *sql.DB) OpOption { + return func(op *Op) { + op.dbRW = db + } +} + +func WithDBRO(db *sql.DB) OpOption { + return func(op *Op) { + op.dbRO = db + } +} + +// Specifies the nvidia-smi binary path to overwrite the default path. +func WithNvidiaSMICommand(p string) OpOption { + return func(op *Op) { + op.nvidiaSMICommand = p + } +} + +func WithNvidiaSMIQueryCommand(p string) OpOption { + return func(op *Op) { + op.nvidiaSMIQueryCommand = p + } +} + +// Specifies the ibstat binary path to overwrite the default path. +func WithIbstatCommand(p string) OpOption { + return func(op *Op) { + op.ibstatCommand = p + } +} + +func WithInfinibandClassDirectory(p string) OpOption { + return func(op *Op) { + op.infinibandClassDirectory = p + } +} diff --git a/components/accelerator/nvidia/query/peermem/peermem.go b/components/accelerator/nvidia/query/peermem/peermem.go index b1984354..28dccc06 100644 --- a/components/accelerator/nvidia/query/peermem/peermem.go +++ b/components/accelerator/nvidia/query/peermem/peermem.go @@ -8,7 +8,6 @@ import ( "strings" "time" - "github.com/leptonai/gpud/components/accelerator/nvidia/query/infiniband" "github.com/leptonai/gpud/log" "github.com/leptonai/gpud/pkg/process" ) @@ -66,9 +65,7 @@ func CheckLsmodPeermemModule(ctx context.Context) (*LsmodPeermemModuleOutput, er } o := &LsmodPeermemModuleOutput{ - IbstatExists: infiniband.IbstatExists(), - InfinibandClassExists: infiniband.CountInfinibandClass() > 0, - Raw: strings.Join(lines, "\n"), + Raw: strings.Join(lines, "\n"), } o.IbcoreUsingPeermemModule = HasLsmodInfinibandPeerMem(o.Raw) @@ -102,8 +99,6 @@ func HasLsmodInfinibandPeerMem(lsmodOutput string) bool { } type LsmodPeermemModuleOutput struct { - IbstatExists bool `json:"ibstat_exists"` - InfinibandClassExists bool `json:"infiniband_class_exists"` Raw string `json:"raw"` IbcoreUsingPeermemModule bool `json:"ibcore_using_peermem_module"` } diff --git a/components/accelerator/nvidia/query/query.go b/components/accelerator/nvidia/query/query.go index c1a76bc0..e58d80fb 100644 --- a/components/accelerator/nvidia/query/query.go +++ b/components/accelerator/nvidia/query/query.go @@ -4,9 +4,9 @@ package query import ( "context" - "database/sql" "fmt" "os" + "strings" "sync" "time" @@ -28,6 +28,7 @@ import ( query_config "github.com/leptonai/gpud/components/query/config" "github.com/leptonai/gpud/components/systemd" "github.com/leptonai/gpud/log" + "github.com/leptonai/gpud/pkg/file" go_nvml "github.com/NVIDIA/go-nvml/pkg/nvml" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -40,7 +41,7 @@ var ( ) // only set once since it relies on the kube client and specific port -func SetDefaultPoller(dbRW *sql.DB, dbRO *sql.DB) { +func SetDefaultPoller(opts ...OpOption) { defaultPollerOnce.Do(func() { defaultPoller = query.New( "shared-nvidia-poller", @@ -51,7 +52,7 @@ func SetDefaultPoller(dbRW *sql.DB, dbRO *sql.DB) { Retention: metav1.Duration{Duration: query_config.DefaultStateRetention}, }, }, - CreateGet(dbRW, dbRO), + CreateGet(opts...), nil, ) }) @@ -70,21 +71,26 @@ func GetSuccessOnce() <-chan any { return getSuccessOnce } -func CreateGet(dbRW *sql.DB, dbRO *sql.DB) query.GetFunc { +func CreateGet(opts ...OpOption) query.GetFunc { return func(ctx context.Context) (_ any, e error) { // "ctx" here is the root level and used for instantiating the "shared" NVML instance "once" // and all other sub-calls have its own context timeouts, thus we do not set the timeout here // otherwise, we will cancel all future operations when the instance is created only once! - return Get(ctx, dbRW, dbRO) + return Get(ctx, opts...) } } // Get all nvidia component queries. -func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error) { +func Get(ctx context.Context, opts ...OpOption) (output any, err error) { + op := &Op{} + if err := op.applyOpts(opts); err != nil { + return nil, fmt.Errorf("failed to apply options: %w", err) + } + if err := nvml.StartDefaultInstance( ctx, - nvml.WithDBRW(dbRW), - nvml.WithDBRO(dbRO), + nvml.WithDBRW(op.dbRW), + nvml.WithDBRO(op.dbRO), nvml.WithGPMMetricsID( go_nvml.GPM_METRIC_SM_OCCUPANCY, go_nvml.GPM_METRIC_INTEGER_UTIL, @@ -100,12 +106,20 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error return nil, fmt.Errorf("failed to start nvml instance: %w", err) } + p, err := file.LocateExecutable(strings.Split(op.nvidiaSMICommand, " ")[0]) + smiExists := err == nil && p != "" + + p, err = file.LocateExecutable(strings.Split(op.ibstatCommand, " ")[0]) + ibstatExists := err == nil && p != "" + + ibClassCount := infiniband.CountInfinibandClassBySubDir(op.infinibandClassDirectory) + o := &Output{ Time: time.Now().UTC(), - SMIExists: SMIExists(), + SMIExists: smiExists, FabricManagerExists: FabricManagerExists(), - InfinibandClassExists: infiniband.CountInfinibandClass() > 0, - IbstatExists: infiniband.IbstatExists(), + InfinibandClassExists: ibClassCount > 0, + IbstatExists: ibstatExists, } log.Logger.Debugw("counting gpu devices") @@ -175,9 +189,9 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error } if o.InfinibandClassExists && o.IbstatExists { - log.Logger.Debugw("running ibstat") + log.Logger.Debugw("running ibstat", "command", op.ibstatCommand) cctx, ccancel := context.WithTimeout(ctx, 30*time.Second) - o.Ibstat, err = infiniband.RunIbstat(cctx) + o.Ibstat, err = infiniband.GetIbstatOutput(cctx, []string{op.ibstatCommand}) ccancel() if err != nil { if o.Ibstat == nil { @@ -248,7 +262,10 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error if o.SMIExists { // call this with a timeout, as a broken GPU may block the command. cctx, ccancel := context.WithTimeout(ctx, 2*time.Minute) - o.SMI, err = GetSMIOutput(cctx) + o.SMI, err = GetSMIOutput(cctx, + []string{op.nvidiaSMICommand}, + []string{op.nvidiaSMIQueryCommand}, + ) ccancel() if err != nil { o.SMIQueryErrors = append(o.SMIQueryErrors, err.Error()) @@ -265,7 +282,7 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error events := o.SMI.HWSlowdownEvents(truncNowUTC.Unix()) for _, event := range events { cctx, ccancel = context.WithTimeout(ctx, time.Minute) - found, err := metrics_clock_events_state.FindEvent(cctx, dbRO, event) + found, err := metrics_clock_events_state.FindEvent(cctx, op.dbRO, event) ccancel() if err != nil { o.SMIQueryErrors = append(o.SMIQueryErrors, fmt.Sprintf("failed to find clock events: %v", err)) @@ -275,7 +292,7 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error continue } cctx, ccancel = context.WithTimeout(ctx, time.Minute) - err = metrics_clock_events_state.InsertEvent(cctx, dbRW, event) + err = metrics_clock_events_state.InsertEvent(cctx, op.dbRW, event) ccancel() if err != nil { o.SMIQueryErrors = append(o.SMIQueryErrors, fmt.Sprintf("failed to persist clock events: %v", err)) diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi.525.125.06.out.0.valid b/components/accelerator/nvidia/query/testdata/nvidia-smi.525.125.06.out.0.valid new file mode 100644 index 00000000..22fd2fed --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi.525.125.06.out.0.valid @@ -0,0 +1,11 @@ ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 525.125.06 Driver Version: 525.125.06 CUDA Version: 12.0 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|===============================+======================+======================| +| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | 0 | +|ERR! 38C P5 49W / 450W | 2021MiB / 23028MiB | 0% E. Process | +| | | N/A | ++-------------------------------+----------------------+----------------------+ \ No newline at end of file diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi.550.90.07.out.0.valid b/components/accelerator/nvidia/query/testdata/nvidia-smi.550.90.07.out.0.valid new file mode 100644 index 00000000..9d45fc68 --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi.550.90.07.out.0.valid @@ -0,0 +1,48 @@ +Mon Jan 20 10:18:26 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 27C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 24C P0 74W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 23C P0 70W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 23C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 27C P0 73W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 22C P0 74W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 24C P0 73W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 23C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/components/accelerator/nvidia/remapped-rows/component.go b/components/accelerator/nvidia/remapped-rows/component.go index b9eba052..d6db37f3 100644 --- a/components/accelerator/nvidia/remapped-rows/component.go +++ b/components/accelerator/nvidia/remapped-rows/component.go @@ -22,7 +22,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/remapped-rows/config.go b/components/accelerator/nvidia/remapped-rows/config.go index c298ffbd..90c7a987 100644 --- a/components/accelerator/nvidia/remapped-rows/config.go +++ b/components/accelerator/nvidia/remapped-rows/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/temperature/component.go b/components/accelerator/nvidia/temperature/component.go index 0816b0e7..9128232d 100644 --- a/components/accelerator/nvidia/temperature/component.go +++ b/components/accelerator/nvidia/temperature/component.go @@ -22,7 +22,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/temperature/config.go b/components/accelerator/nvidia/temperature/config.go index 04f371b1..ac6f3e6d 100644 --- a/components/accelerator/nvidia/temperature/config.go +++ b/components/accelerator/nvidia/temperature/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/utilization/component.go b/components/accelerator/nvidia/utilization/component.go index e5b7910e..bc581427 100644 --- a/components/accelerator/nvidia/utilization/component.go +++ b/components/accelerator/nvidia/utilization/component.go @@ -22,7 +22,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/utilization/config.go b/components/accelerator/nvidia/utilization/config.go index d60445ef..7d37233b 100644 --- a/components/accelerator/nvidia/utilization/config.go +++ b/components/accelerator/nvidia/utilization/config.go @@ -9,6 +9,11 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/diagnose/diagnose.go b/components/diagnose/diagnose.go index 9b3e212f..9c37ca47 100644 --- a/components/diagnose/diagnose.go +++ b/components/diagnose/diagnose.go @@ -283,7 +283,11 @@ func run(ctx context.Context, dir string, opts ...OpOption) error { }) } else { fmt.Printf("%s checking nvidia-smi output\n", inProgress) - nout, err := nvidia_query.GetSMIOutput(ctx) + nout, err := nvidia_query.GetSMIOutput( + ctx, + []string{"nvidia-smi"}, + []string{"nvidia-smi", "--query"}, + ) if err != nil { o.Results = append(o.Results, CommandResult{ Command: "nvidia-smi -q", diff --git a/components/diagnose/options.go b/components/diagnose/options.go index 69326bf3..23697a15 100644 --- a/components/diagnose/options.go +++ b/components/diagnose/options.go @@ -1,6 +1,11 @@ package diagnose type Op struct { + nvidiaSMICommand string + nvidiaSMIQueryCommand string + ibstatCommand string + infinibandClassDirectory string + lines int debug bool createArchive bool @@ -18,12 +23,52 @@ func (op *Op) applyOpts(opts []OpOption) error { for _, opt := range opts { opt(op) } + + if op.nvidiaSMICommand == "" { + op.nvidiaSMICommand = "nvidia-smi" + } + if op.nvidiaSMIQueryCommand == "" { + op.nvidiaSMIQueryCommand = "nvidia-smi --query" + } + if op.ibstatCommand == "" { + op.ibstatCommand = "ibstat" + } + if op.infinibandClassDirectory == "" { + op.infinibandClassDirectory = "/sys/class/infiniband" + } + if op.lines == 0 { op.lines = 100 } return nil } +// Specifies the nvidia-smi binary path to overwrite the default path. +func WithNvidiaSMICommand(p string) OpOption { + return func(op *Op) { + op.nvidiaSMICommand = p + } +} + +func WithNvidiaSMIQueryCommand(p string) OpOption { + return func(op *Op) { + op.nvidiaSMIQueryCommand = p + } +} + +// Specifies the ibstat binary path to overwrite the default path. +func WithIbstatCommand(p string) OpOption { + return func(op *Op) { + op.ibstatCommand = p + } +} + +func WithInfinibandClassDirectory(p string) OpOption { + return func(op *Op) { + op.infinibandClassDirectory = p + } +} + func WithLines(lines int) OpOption { return func(op *Op) { op.lines = lines diff --git a/components/diagnose/scan.go b/components/diagnose/scan.go index 1484fbd1..a6ae1bc2 100644 --- a/components/diagnose/scan.go +++ b/components/diagnose/scan.go @@ -126,7 +126,14 @@ func Scan(ctx context.Context, opts ...OpOption) error { } defer db.Close() - outputRaw, err := nvidia_query.Get(ctx, db, db) + outputRaw, err := nvidia_query.Get( + ctx, + nvidia_query.WithDBRW(db), + nvidia_query.WithDBRO(db), + nvidia_query.WithNvidiaSMICommand(op.nvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(op.nvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(op.ibstatCommand), + ) if err != nil { log.Logger.Warnw("error getting nvidia info", "error", err) } else { diff --git a/config/config.go b/config/config.go index 0acb88ac..d5f4619e 100644 --- a/config/config.go +++ b/config/config.go @@ -46,6 +46,9 @@ type Config struct { // Configures the local web configuration. Web *Web `json:"web,omitempty"` + // Overwrites the tool binaries for testing. + ToolOverwriteOptions ToolOverwriteOptions `json:"tool_overwrite_options"` + // Set false to disable auto update EnableAutoUpdate bool `json:"enable_auto_update"` @@ -70,6 +73,13 @@ type Web struct { SincePeriod metav1.Duration `json:"since_period"` } +type ToolOverwriteOptions struct { + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` +} + var ErrInvalidAutoUpdateExitCode = errors.New("auto_update_exit_code is only valid when auto_update is enabled") func (config *Config) Validate() error { diff --git a/config/default.go b/config/default.go index 2ba22b90..06ac5a79 100644 --- a/config/default.go +++ b/config/default.go @@ -157,6 +157,13 @@ func DefaultConfig(ctx context.Context, opts ...OpOption) (*Config, error) { SincePeriod: DefaultRetentionPeriod, }, + ToolOverwriteOptions: ToolOverwriteOptions{ + NvidiaSMICommand: options.NvidiaSMICommand, + NvidiaSMIQueryCommand: options.NvidiaSMIQueryCommand, + IbstatCommand: options.IbstatCommand, + InfinibandClassDirectory: options.InfinibandClassDirectory, + }, + EnableAutoUpdate: true, } diff --git a/config/op_options.go b/config/op_options.go index 2be8fed0..5fd58a53 100644 --- a/config/op_options.go +++ b/config/op_options.go @@ -8,6 +8,11 @@ type Op struct { ExpectedPortStates *infiniband.ExpectedPortStates DockerIgnoreConnectionErrors bool KubeletIgnoreConnectionErrors bool + + NvidiaSMICommand string + NvidiaSMIQueryCommand string + IbstatCommand string + InfinibandClassDirectory string } type OpOption func(*Op) @@ -49,3 +54,29 @@ func WithKubeletIgnoreConnectionErrors(b bool) OpOption { op.KubeletIgnoreConnectionErrors = b } } + +// Specifies the nvidia-smi binary path to overwrite the default path. +func WithNvidiaSMICommand(p string) OpOption { + return func(op *Op) { + op.NvidiaSMICommand = p + } +} + +func WithNvidiaSMIQueryCommand(p string) OpOption { + return func(op *Op) { + op.NvidiaSMIQueryCommand = p + } +} + +// Specifies the ibstat binary path to overwrite the default path. +func WithIbstatCommand(p string) OpOption { + return func(op *Op) { + op.IbstatCommand = p + } +} + +func WithInfinibandClassDirectory(p string) OpOption { + return func(op *Op) { + op.InfinibandClassDirectory = p + } +}