From 634c0b9d20dd0c76e840ffae42e6b2da7389e95f Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 21 Jan 2025 21:10:31 +0800 Subject: [PATCH 1/4] feat(nvidia): configurable nvidia-smi binary, ibstat binary, infiniband class dir paths for mock testing Signed-off-by: Gyuho Lee --- cmd/gpud/command/command.go | 57 +++++++++++- cmd/gpud/command/scan.go | 4 + .../accelerator/nvidia/bad-envs/component.go | 12 ++- .../nvidia/clock-speed/component.go | 12 ++- .../accelerator/nvidia/clock-speed/config.go | 33 ------- .../nvidia/{bad-envs => common}/config.go | 11 ++- .../accelerator/nvidia/ecc/component.go | 12 ++- components/accelerator/nvidia/ecc/config.go | 33 ------- .../nvidia/error-xid-sxid/component.go | 14 ++- .../nvidia/error-xid-sxid/config.go | 33 ------- .../accelerator/nvidia/error/component.go | 12 ++- components/accelerator/nvidia/error/config.go | 33 ------- .../accelerator/nvidia/error/xid/component.go | 3 +- .../nvidia/error/xid/component_output.go | 3 +- .../accelerator/nvidia/error/xid/config.go | 33 ------- .../nvidia/fabric-manager/component.go | 9 +- .../nvidia/fabric-manager/config.go | 2 + .../accelerator/nvidia/gpm/component.go | 3 +- .../nvidia/gpm/component_output.go | 3 +- components/accelerator/nvidia/gpm/config.go | 33 ------- .../nvidia/gsp-firmware-mode/component.go | 12 ++- .../nvidia/gsp-firmware-mode/config.go | 33 ------- .../nvidia/hw-slowdown/component.go | 12 ++- .../accelerator/nvidia/hw-slowdown/config.go | 33 ------- .../nvidia/infiniband/component.go | 9 +- .../accelerator/nvidia/infiniband/config.go | 3 + .../accelerator/nvidia/info/component.go | 12 ++- components/accelerator/nvidia/info/config.go | 33 ------- .../accelerator/nvidia/memory/component.go | 12 ++- .../accelerator/nvidia/memory/config.go | 33 ------- .../accelerator/nvidia/nccl/component.go | 12 ++- components/accelerator/nvidia/nccl/config.go | 33 ------- .../accelerator/nvidia/nvlink/component.go | 12 ++- .../accelerator/nvidia/nvlink/config.go | 33 ------- 
.../accelerator/nvidia/peermem/component.go | 12 ++- .../accelerator/nvidia/peermem/config.go | 33 ------- .../nvidia/persistence-mode/component.go | 12 ++- .../nvidia/persistence-mode/config.go | 33 ------- .../accelerator/nvidia/power/component.go | 12 ++- components/accelerator/nvidia/power/config.go | 33 ------- .../accelerator/nvidia/processes/component.go | 12 ++- .../accelerator/nvidia/processes/config.go | 33 ------- .../nvidia/query/infiniband/ibstat.go | 41 +++++++-- .../nvidia/query/infiniband/ibstat_test.go | 27 ++++++ .../nvidia/query/infiniband/infiniband.go | 22 +---- .../query/infiniband/infiniband_test.go | 4 +- .../nvidia/query/nvidia_smi_query.go | 42 +++++---- .../nvidia/query/nvidia_smi_query_test.go | 32 +++++++ .../accelerator/nvidia/query/nvml/nvml.go | 1 + .../accelerator/nvidia/query/options.go | 80 ++++++++++++++++ .../nvidia/query/peermem/peermem.go | 7 +- components/accelerator/nvidia/query/query.go | 68 ++++++++++---- .../nvidia-smi.525.125.06.out.0.valid | 11 +++ .../testdata/nvidia-smi.550.90.07.out.0.valid | 48 ++++++++++ .../nvidia/remapped-rows/component.go | 12 ++- .../nvidia/remapped-rows/config.go | 33 ------- .../nvidia/temperature/component.go | 12 ++- .../accelerator/nvidia/temperature/config.go | 33 ------- .../nvidia/utilization/component.go | 12 ++- .../accelerator/nvidia/utilization/config.go | 33 ------- components/diagnose/diagnose.go | 6 +- components/diagnose/options.go | 45 +++++++++ components/diagnose/scan.go | 13 ++- config/config.go | 10 ++ config/default.go | 7 ++ config/op_options.go | 33 ++++++- internal/server/server.go | 91 ++++++++++--------- 67 files changed, 750 insertions(+), 790 deletions(-) delete mode 100644 components/accelerator/nvidia/clock-speed/config.go rename components/accelerator/nvidia/{bad-envs => common}/config.go (64%) delete mode 100644 components/accelerator/nvidia/ecc/config.go delete mode 100644 components/accelerator/nvidia/error-xid-sxid/config.go delete mode 100644 
components/accelerator/nvidia/error/config.go delete mode 100644 components/accelerator/nvidia/error/xid/config.go delete mode 100644 components/accelerator/nvidia/gpm/config.go delete mode 100644 components/accelerator/nvidia/gsp-firmware-mode/config.go delete mode 100644 components/accelerator/nvidia/hw-slowdown/config.go delete mode 100644 components/accelerator/nvidia/info/config.go delete mode 100644 components/accelerator/nvidia/memory/config.go delete mode 100644 components/accelerator/nvidia/nccl/config.go delete mode 100644 components/accelerator/nvidia/nvlink/config.go delete mode 100644 components/accelerator/nvidia/peermem/config.go delete mode 100644 components/accelerator/nvidia/persistence-mode/config.go delete mode 100644 components/accelerator/nvidia/power/config.go delete mode 100644 components/accelerator/nvidia/processes/config.go create mode 100644 components/accelerator/nvidia/query/options.go create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi.525.125.06.out.0.valid create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi.550.90.07.out.0.valid delete mode 100644 components/accelerator/nvidia/remapped-rows/config.go delete mode 100644 components/accelerator/nvidia/temperature/config.go delete mode 100644 components/accelerator/nvidia/utilization/config.go diff --git a/cmd/gpud/command/command.go b/cmd/gpud/command/command.go index 9ef937e4..1c807b9d 100644 --- a/cmd/gpud/command/command.go +++ b/cmd/gpud/command/command.go @@ -54,6 +54,11 @@ var ( dockerIgnoreConnectionErrors bool kubeletIgnoreConnectionErrors bool + + nvidiaSMICommand string + nvidiaSMIQueryCommand string + ibstatCommand string + infinibandClassDirectory string ) const ( @@ -71,7 +76,6 @@ func App() *cli.App { app.Description = "monitor your GPU/CPU machines and run workloads" app.Commands = []cli.Command{ - { Name: "login", Usage: "login gpud to lepton.ai (called automatically in gpud up with non-empty --token)", @@ -263,6 +267,32 @@ 
sudo rm /etc/systemd/system/gpud.service Usage: "ignore connection errors to kubelet read-only port, useful when kubelet readOnlyPort is disabled (default: false)", Destination: &kubeletIgnoreConnectionErrors, }, + + // only for testing + cli.StringFlag{ + Name: "nvidia-smi-command", + Usage: "sets the nvidia-smi command (leave empty for default, useful for testing)", + Destination: &nvidiaSMICommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "nvidia-smi-query-command", + Usage: "sets the nvidia-smi --query command (leave empty for default, useful for testing)", + Destination: &nvidiaSMIQueryCommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "ibstat-command", + Usage: "sets the ibstat command (leave empty for default, useful for testing)", + Destination: &ibstatCommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "infiniband-class-directory", + Usage: "sets the infiniband class directory (leave empty for default, useful for testing)", + Destination: &infinibandClassDirectory, + Hidden: true, + }, }, }, @@ -542,6 +572,31 @@ cat summary.txt Usage: "enable dmesg checks (default: true)", Destination: &dmesgCheck, }, + // only for testing + cli.StringFlag{ + Name: "nvidia-smi-command", + Usage: "sets the nvidia-smi command (leave empty for default, useful for testing)", + Destination: &nvidiaSMICommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "nvidia-smi-query-command", + Usage: "sets the nvidia-smi --query command (leave empty for default, useful for testing)", + Destination: &nvidiaSMIQueryCommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "ibstat-command", + Usage: "sets the ibstat command (leave empty for default, useful for testing)", + Destination: &ibstatCommand, + Hidden: true, + }, + cli.StringFlag{ + Name: "infiniband-class-directory", + Usage: "sets the infiniband class directory (leave empty for default, useful for testing)", + Destination: &infinibandClassDirectory, + Hidden: true, + }, }, }, { diff --git a/cmd/gpud/command/scan.go 
b/cmd/gpud/command/scan.go index 79ae598c..586800a3 100644 --- a/cmd/gpud/command/scan.go +++ b/cmd/gpud/command/scan.go @@ -31,6 +31,10 @@ func cmdScan(cliContext *cli.Context) error { diagnose.WithNetcheck(netcheck), diagnose.WithDiskcheck(diskcheck), diagnose.WithDmesgCheck(dmesgCheck), + diagnose.WithNvidiaSMICommand(nvidiaSMICommand), + diagnose.WithNvidiaSMIQueryCommand(nvidiaSMIQueryCommand), + diagnose.WithIbstatCommand(ibstatCommand), + diagnose.WithInfinibandClassDirectory(infinibandClassDirectory), } if zapLvl.Level() <= zap.DebugLevel { // e.g., info, warn, error diagnoseOpts = append(diagnoseOpts, diagnose.WithDebug(true)) diff --git a/components/accelerator/nvidia/bad-envs/component.go b/components/accelerator/nvidia/bad-envs/component.go index 96f2c0d7..5603785a 100644 --- a/components/accelerator/nvidia/bad-envs/component.go +++ b/components/accelerator/nvidia/bad-envs/component.go @@ -8,16 +8,24 @@ import ( "github.com/leptonai/gpud/components" bad_envs_id "github.com/leptonai/gpud/components/accelerator/nvidia/bad-envs/id" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) 
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, bad_envs_id.Name) return &component{ diff --git a/components/accelerator/nvidia/clock-speed/component.go b/components/accelerator/nvidia/clock-speed/component.go index 8c77185d..baf4cf43 100644 --- a/components/accelerator/nvidia/clock-speed/component.go +++ b/components/accelerator/nvidia/clock-speed/component.go @@ -9,6 +9,7 @@ import ( "github.com/leptonai/gpud/components" nvidia_clock_speed_id "github.com/leptonai/gpud/components/accelerator/nvidia/clock-speed/id" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_clockspeed "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/clock-speed" "github.com/leptonai/gpud/components/query" @@ -17,11 +18,18 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_clock_speed_id.Name) return &component{ diff --git a/components/accelerator/nvidia/clock-speed/config.go b/components/accelerator/nvidia/clock-speed/config.go deleted file mode 100644 index b985096b..00000000 --- a/components/accelerator/nvidia/clock-speed/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package clockspeed - -import ( - 
"database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/bad-envs/config.go b/components/accelerator/nvidia/common/config.go similarity index 64% rename from components/accelerator/nvidia/bad-envs/config.go rename to components/accelerator/nvidia/common/config.go index d8398fab..ee2cf4d0 100644 --- a/components/accelerator/nvidia/bad-envs/config.go +++ b/components/accelerator/nvidia/common/config.go @@ -1,4 +1,4 @@ -package badenvs +package common import ( "database/sql" @@ -9,6 +9,15 @@ import ( type Config struct { Query query_config.Config `json:"query"` + + ToolOverwrites +} + +type ToolOverwrites struct { + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/ecc/component.go b/components/accelerator/nvidia/ecc/component.go index 9b6340d9..90df889c 100644 --- a/components/accelerator/nvidia/ecc/component.go +++ b/components/accelerator/nvidia/ecc/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_ecc_id "github.com/leptonai/gpud/components/accelerator/nvidia/ecc/id" nvidia_query 
"github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_ecc "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/ecc" @@ -17,11 +18,18 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_ecc_id.Name) return &component{ diff --git a/components/accelerator/nvidia/ecc/config.go b/components/accelerator/nvidia/ecc/config.go deleted file mode 100644 index 4080e02b..00000000 --- a/components/accelerator/nvidia/ecc/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package ecc - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/error-xid-sxid/component.go b/components/accelerator/nvidia/error-xid-sxid/component.go index 
f40e0444..162b3628 100644 --- a/components/accelerator/nvidia/error-xid-sxid/component.go +++ b/components/accelerator/nvidia/error-xid-sxid/component.go @@ -9,6 +9,7 @@ import ( "github.com/dustin/go-humanize" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_error_xid_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error-xid-sxid/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_xid_sxid_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid-sxid-state" @@ -18,12 +19,19 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() // this starts the Xid poller via "nvml.StartDefaultInstance" cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_error_xid_sxid_id.Name) return &component{ @@ -37,7 +45,7 @@ func New(ctx context.Context, cfg Config) components.Component { var _ components.Component = (*component)(nil) type component struct { - cfg Config + cfg nvidia_common.Config rootCtx context.Context cancel context.CancelFunc poller query.Poller diff --git a/components/accelerator/nvidia/error-xid-sxid/config.go b/components/accelerator/nvidia/error-xid-sxid/config.go deleted file mode 100644 index 3fe6dabd..00000000 --- 
a/components/accelerator/nvidia/error-xid-sxid/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package errorxidsxid - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/error/component.go b/components/accelerator/nvidia/error/component.go index cb29196d..53cecbdd 100644 --- a/components/accelerator/nvidia/error/component.go +++ b/components/accelerator/nvidia/error/component.go @@ -7,6 +7,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" @@ -14,11 +15,18 @@ import ( const Name = "accelerator-nvidia-error" -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + 
nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/error/config.go b/components/accelerator/nvidia/error/config.go deleted file mode 100644 index 755e7cd8..00000000 --- a/components/accelerator/nvidia/error/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package error - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/error/xid/component.go b/components/accelerator/nvidia/error/xid/component.go index b19e304e..b1975bb9 100644 --- a/components/accelerator/nvidia/error/xid/component.go +++ b/components/accelerator/nvidia/error/xid/component.go @@ -9,6 +9,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id" nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml" nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid" @@ -17,7 +18,7 @@ import ( "github.com/leptonai/gpud/log" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() setDefaultPoller(cfg) diff --git 
a/components/accelerator/nvidia/error/xid/component_output.go b/components/accelerator/nvidia/error/xid/component_output.go index db691f31..702d81f6 100644 --- a/components/accelerator/nvidia/error/xid/component_output.go +++ b/components/accelerator/nvidia/error/xid/component_output.go @@ -11,6 +11,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id" nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml" nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid" @@ -223,7 +224,7 @@ var ( ) // only set once since it relies on the kube client and specific port -func setDefaultPoller(cfg Config) { +func setDefaultPoller(cfg nvidia_common.Config) { defaultPollerOnce.Do(func() { defaultPoller = query.New( nvidia_component_error_xid_id.Name, diff --git a/components/accelerator/nvidia/error/xid/config.go b/components/accelerator/nvidia/error/xid/config.go deleted file mode 100644 index 1587e2a7..00000000 --- a/components/accelerator/nvidia/error/xid/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package xid - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/fabric-manager/component.go b/components/accelerator/nvidia/fabric-manager/component.go index 
0ace8b58..8c9dcbc8 100644 --- a/components/accelerator/nvidia/fabric-manager/component.go +++ b/components/accelerator/nvidia/fabric-manager/component.go @@ -21,7 +21,14 @@ func New(ctx context.Context, cfg Config) (components.Component, error) { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Log.Query.State.DBRW, cfg.Log.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Log.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Log.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) if err := cfg.Log.Validate(); err != nil { diff --git a/components/accelerator/nvidia/fabric-manager/config.go b/components/accelerator/nvidia/fabric-manager/config.go index a6ba3328..a659b3b3 100644 --- a/components/accelerator/nvidia/fabric-manager/config.go +++ b/components/accelerator/nvidia/fabric-manager/config.go @@ -4,6 +4,7 @@ import ( "database/sql" "encoding/json" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" fabric_manager_log "github.com/leptonai/gpud/components/accelerator/nvidia/query/fabric-manager-log" query_config "github.com/leptonai/gpud/components/query/config" query_log_common "github.com/leptonai/gpud/components/query/log/common" @@ -15,6 +16,7 @@ import ( type Config struct { Query query_config.Config `json:"query"` Log query_log_config.Config `json:"log"` + nvidia_common.ToolOverwrites } func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { diff --git a/components/accelerator/nvidia/gpm/component.go b/components/accelerator/nvidia/gpm/component.go index bcead140..1b01659b 100644 --- a/components/accelerator/nvidia/gpm/component.go +++ 
b/components/accelerator/nvidia/gpm/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query_metrics_gpm "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/gpm" nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml" components_metrics_state "github.com/leptonai/gpud/components/metrics/state" @@ -19,7 +20,7 @@ import ( const Name = "accelerator-nvidia-gpm" -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() setDefaultPoller(cfg) diff --git a/components/accelerator/nvidia/gpm/component_output.go b/components/accelerator/nvidia/gpm/component_output.go index bf245000..ea248462 100644 --- a/components/accelerator/nvidia/gpm/component_output.go +++ b/components/accelerator/nvidia/gpm/component_output.go @@ -8,6 +8,7 @@ import ( "sync" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml" components_metrics "github.com/leptonai/gpud/components/metrics" "github.com/leptonai/gpud/components/query" @@ -96,7 +97,7 @@ var ( ) // only set once since it relies on the kube client and specific port -func setDefaultPoller(cfg Config) { +func setDefaultPoller(cfg nvidia_common.Config) { defaultPollerOnce.Do(func() { defaultPoller = query.New( Name, diff --git a/components/accelerator/nvidia/gpm/config.go b/components/accelerator/nvidia/gpm/config.go deleted file mode 100644 index 88faa6bc..00000000 --- a/components/accelerator/nvidia/gpm/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package gpm - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query 
query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/gsp-firmware-mode/component.go b/components/accelerator/nvidia/gsp-firmware-mode/component.go index 0b41d495..0daee2d8 100644 --- a/components/accelerator/nvidia/gsp-firmware-mode/component.go +++ b/components/accelerator/nvidia/gsp-firmware-mode/component.go @@ -7,17 +7,25 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_gsp_firmware_mode_id.Name) return &component{ diff --git 
a/components/accelerator/nvidia/gsp-firmware-mode/config.go b/components/accelerator/nvidia/gsp-firmware-mode/config.go deleted file mode 100644 index b2a61c8b..00000000 --- a/components/accelerator/nvidia/gsp-firmware-mode/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package gspfirmwaremode - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/hw-slowdown/component.go b/components/accelerator/nvidia/hw-slowdown/component.go index a131ea2f..b5d7d67b 100644 --- a/components/accelerator/nvidia/hw-slowdown/component.go +++ b/components/accelerator/nvidia/hw-slowdown/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_hw_slowdown_id "github.com/leptonai/gpud/components/accelerator/nvidia/hw-slowdown/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state" @@ -20,11 +21,18 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( 
+ nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_hw_slowdown_id.Name) return &component{ diff --git a/components/accelerator/nvidia/hw-slowdown/config.go b/components/accelerator/nvidia/hw-slowdown/config.go deleted file mode 100644 index 522632af..00000000 --- a/components/accelerator/nvidia/hw-slowdown/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package hwslowdown - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/infiniband/component.go b/components/accelerator/nvidia/infiniband/component.go index 842407f7..fe4066db 100644 --- a/components/accelerator/nvidia/infiniband/component.go +++ b/components/accelerator/nvidia/infiniband/component.go @@ -18,7 +18,14 @@ func New(ctx context.Context, cfg Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + 
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_infiniband_id.Name) return &component{ diff --git a/components/accelerator/nvidia/infiniband/config.go b/components/accelerator/nvidia/infiniband/config.go index d342edb7..4b5ead1b 100644 --- a/components/accelerator/nvidia/infiniband/config.go +++ b/components/accelerator/nvidia/infiniband/config.go @@ -4,6 +4,7 @@ import ( "database/sql" "encoding/json" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" query_config "github.com/leptonai/gpud/components/query/config" ) @@ -11,6 +12,8 @@ type Config struct { Query query_config.Config `json:"query"` ExpectedPortStates + + nvidia_common.ToolOverwrites } // Configures the expected state of the ports. 
diff --git a/components/accelerator/nvidia/info/component.go b/components/accelerator/nvidia/info/component.go index 255b2545..0ffeddcf 100644 --- a/components/accelerator/nvidia/info/component.go +++ b/components/accelerator/nvidia/info/component.go @@ -7,6 +7,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" @@ -14,11 +15,18 @@ import ( const Name = "accelerator-nvidia-info" -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/info/config.go b/components/accelerator/nvidia/info/config.go deleted file mode 100644 index 28ec8612..00000000 --- a/components/accelerator/nvidia/info/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package info - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = 
json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/memory/component.go b/components/accelerator/nvidia/memory/component.go index ae17686f..83b03851 100644 --- a/components/accelerator/nvidia/memory/component.go +++ b/components/accelerator/nvidia/memory/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_memory "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/memory" "github.com/leptonai/gpud/components/query" @@ -18,11 +19,18 @@ import ( const Name = "accelerator-nvidia-memory" -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/memory/config.go b/components/accelerator/nvidia/memory/config.go deleted file mode 100644 index 31be6df5..00000000 --- a/components/accelerator/nvidia/memory/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package memory - -import ( - "database/sql" - 
"encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/nccl/component.go b/components/accelerator/nvidia/nccl/component.go index cca3a5d6..876925d7 100644 --- a/components/accelerator/nvidia/nccl/component.go +++ b/components/accelerator/nvidia/nccl/component.go @@ -9,6 +9,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_nccl_id "github.com/leptonai/gpud/components/accelerator/nvidia/nccl/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/components/dmesg" @@ -16,11 +17,18 @@ import ( "github.com/leptonai/gpud/log" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_nccl_id.Name) return 
&component{ diff --git a/components/accelerator/nvidia/nccl/config.go b/components/accelerator/nvidia/nccl/config.go deleted file mode 100644 index 3e9c02ba..00000000 --- a/components/accelerator/nvidia/nccl/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package nccl - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/nvlink/component.go b/components/accelerator/nvidia/nvlink/component.go index b466377b..76f3d431 100644 --- a/components/accelerator/nvidia/nvlink/component.go +++ b/components/accelerator/nvidia/nvlink/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_nvlink "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/nvlink" "github.com/leptonai/gpud/components/query" @@ -18,11 +19,18 @@ import ( const Name = "accelerator-nvidia-nvlink" -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + 
nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/nvlink/config.go b/components/accelerator/nvidia/nvlink/config.go deleted file mode 100644 index f8804eb3..00000000 --- a/components/accelerator/nvidia/nvlink/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package nvlink - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/peermem/component.go b/components/accelerator/nvidia/peermem/component.go index 969e3b1d..202b945f 100644 --- a/components/accelerator/nvidia/peermem/component.go +++ b/components/accelerator/nvidia/peermem/component.go @@ -9,6 +9,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/components/dmesg" @@ -16,11 +17,18 @@ import ( "github.com/leptonai/gpud/log" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) 
components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_peermem_id.Name) return &component{ diff --git a/components/accelerator/nvidia/peermem/config.go b/components/accelerator/nvidia/peermem/config.go deleted file mode 100644 index 1c394ab0..00000000 --- a/components/accelerator/nvidia/peermem/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package peermem - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/persistence-mode/component.go b/components/accelerator/nvidia/persistence-mode/component.go index 67687588..dd788132 100644 --- a/components/accelerator/nvidia/persistence-mode/component.go +++ b/components/accelerator/nvidia/persistence-mode/component.go @@ -7,17 +7,25 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_persistence_mode_id 
"github.com/leptonai/gpud/components/accelerator/nvidia/persistence-mode/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_persistence_mode_id.Name) return &component{ diff --git a/components/accelerator/nvidia/persistence-mode/config.go b/components/accelerator/nvidia/persistence-mode/config.go deleted file mode 100644 index 70ce1f8f..00000000 --- a/components/accelerator/nvidia/persistence-mode/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package persistencemode - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/power/component.go 
b/components/accelerator/nvidia/power/component.go index e5708b26..76b7cda7 100644 --- a/components/accelerator/nvidia/power/component.go +++ b/components/accelerator/nvidia/power/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_power_id "github.com/leptonai/gpud/components/accelerator/nvidia/power/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_power "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/power" @@ -17,11 +18,18 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_power_id.Name) return &component{ diff --git a/components/accelerator/nvidia/power/config.go b/components/accelerator/nvidia/power/config.go deleted file mode 100644 index 0d7ae6a7..00000000 --- a/components/accelerator/nvidia/power/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package power - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := 
json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/processes/component.go b/components/accelerator/nvidia/processes/component.go index a9d9c09f..a4f2732c 100644 --- a/components/accelerator/nvidia/processes/component.go +++ b/components/accelerator/nvidia/processes/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_processes "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/processes" "github.com/leptonai/gpud/components/query" @@ -18,11 +19,18 @@ import ( const Name = "accelerator-nvidia-processes" -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/processes/config.go b/components/accelerator/nvidia/processes/config.go deleted file mode 100644 index a2fae1a3..00000000 --- 
a/components/accelerator/nvidia/processes/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package processes - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/query/infiniband/ibstat.go b/components/accelerator/nvidia/query/infiniband/ibstat.go index 41755e2a..89bdff6c 100644 --- a/components/accelerator/nvidia/query/infiniband/ibstat.go +++ b/components/accelerator/nvidia/query/infiniband/ibstat.go @@ -5,24 +5,51 @@ import ( "context" "errors" "fmt" - "os/exec" "strings" "github.com/leptonai/gpud/log" + "github.com/leptonai/gpud/pkg/process" "sigs.k8s.io/yaml" ) -func RunIbstat(ctx context.Context) (*IbstatOutput, error) { - p, err := exec.LookPath("ibstat") - if err != nil { - return nil, fmt.Errorf("ibstat not found (%w)", err) +func GetIbstatOutput(ctx context.Context, ibstatCommands []string) (*IbstatOutput, error) { + if len(ibstatCommands) == 0 { + ibstatCommands = []string{"ibstat"} } - b, err := exec.CommandContext(ctx, p).CombinedOutput() + + p, err := process.New( + process.WithCommand(ibstatCommands...), + process.WithRunAsBashScript(), + ) if err != nil { return nil, err } + + if err := p.Start(ctx); err != nil { + return nil, err + } + defer func() { + if err := p.Close(ctx); err != nil { + log.Logger.Warnw("failed to abort command", "err", err) + } + }() + + lines := make([]string, 0) + if err := process.Read( + ctx, + p, + process.WithReadStdout(), + process.WithReadStderr(), + 
process.WithProcessLine(func(line string) { + lines = append(lines, line) + }), + process.WithWaitForCmd(), + ); err != nil { + return nil, fmt.Errorf("failed to read ibstat output: %w\n\noutput:\n%s", err, strings.Join(lines, "\n")) + } + o := &IbstatOutput{ - Raw: string(b), + Raw: strings.Join(lines, "\n"), } // TODO: once stable return error diff --git a/components/accelerator/nvidia/query/infiniband/ibstat_test.go b/components/accelerator/nvidia/query/infiniband/ibstat_test.go index 39f47e3d..33eb2bb7 100644 --- a/components/accelerator/nvidia/query/infiniband/ibstat_test.go +++ b/components/accelerator/nvidia/query/infiniband/ibstat_test.go @@ -1,13 +1,40 @@ package infiniband import ( + "context" "errors" "os" "path/filepath" "reflect" "testing" + "time" ) +func TestGetIbstatOutput(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // iterate all files in "testdata/" + matches, err := filepath.Glob("testdata/ibstat.*") + if err != nil { + t.Fatalf("failed to glob: %v", err) + } + + for _, queryFile := range matches { + o, err := GetIbstatOutput( + ctx, + []string{"cat", queryFile}, + ) + if err != nil { + t.Fatal(err) + } + + o.Raw = "" + + t.Logf("%q:\n%+v", queryFile, o) + } +} + func TestParseIBStat(t *testing.T) { input := `CA 'mlx5_0' CA type: MT4129 diff --git a/components/accelerator/nvidia/query/infiniband/infiniband.go b/components/accelerator/nvidia/query/infiniband/infiniband.go index 46d8a8eb..4469232c 100644 --- a/components/accelerator/nvidia/query/infiniband/infiniband.go +++ b/components/accelerator/nvidia/query/infiniband/infiniband.go @@ -5,7 +5,6 @@ import ( "context" "fmt" "os" - "os/exec" "strings" "github.com/leptonai/gpud/log" @@ -34,14 +33,6 @@ func SupportsInfinibandPortRate(gpuProductName string) int { return 0 } -func IbstatExists() bool { - p, err := exec.LookPath("ibstat") - if err != nil { - return false - } - return p != "" -} - // lspci | grep -i infiniband // 
1a:00.0 Infiniband controller: Mellanox Technologies MT2910 Family [ConnectX-7] // 3c:00.0 Infiniband controller: Mellanox Technologies MT2910 Family [ConnectX-7] @@ -106,18 +97,11 @@ func CountInfinibandPCIBuses(ctx context.Context) (int, error) { // Counts the directories in "/sys/class/infiniband". // Returns 0 if the directory does not exist. func CountInfinibandClass() int { - info, err := os.Stat("/sys/class/infiniband") - if err != nil || !info.IsDir() { - return 0 - } - dirs, err := os.ReadDir("/sys/class/infiniband") - if err != nil { - return 0 - } - return len(dirs) + return CountInfinibandClassBySubDir("/sys/class/infiniband") } -func countInfinibandClass(dir string) int { +// Count the sub-directories under the specified directory. +func CountInfinibandClassBySubDir(dir string) int { info, err := os.Stat(dir) if err != nil || !info.IsDir() { return 0 diff --git a/components/accelerator/nvidia/query/infiniband/infiniband_test.go b/components/accelerator/nvidia/query/infiniband/infiniband_test.go index a8bc8324..f54337d3 100644 --- a/components/accelerator/nvidia/query/infiniband/infiniband_test.go +++ b/components/accelerator/nvidia/query/infiniband/infiniband_test.go @@ -5,7 +5,7 @@ import ( "testing" ) -func TestCountInfinibandClass(t *testing.T) { +func TestCountInfinibandClassBySubDir(t *testing.T) { t.Parallel() tests := []struct { @@ -58,7 +58,7 @@ func TestCountInfinibandClass(t *testing.T) { } } - got := countInfinibandClass(tt.dirPath) + got := CountInfinibandClassBySubDir(tt.dirPath) if got != tt.want { t.Errorf("countInfinibandClass() = %v, want %v", got, tt.want) } diff --git a/components/accelerator/nvidia/query/nvidia_smi_query.go b/components/accelerator/nvidia/query/nvidia_smi_query.go index e118652a..7c415089 100644 --- a/components/accelerator/nvidia/query/nvidia_smi_query.go +++ b/components/accelerator/nvidia/query/nvidia_smi_query.go @@ -9,6 +9,7 @@ import ( "fmt" "sort" "strings" + "sync" metrics_clock_events_state 
"github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state" "github.com/leptonai/gpud/log" @@ -25,22 +26,16 @@ func SMIExists() bool { return err == nil } -func RunSMI(ctx context.Context, args ...string) ([]byte, error) { - log.Logger.Debugw("finding nvidia-smi") - nvidiaSMIPath, err := file.LocateExecutable("nvidia-smi") - if err != nil { - return nil, fmt.Errorf("nvidia-smi not found (%w)", err) - } - +func RunSMI(ctx context.Context, commandArgs []string) ([]byte, error) { p, err := process.New( - process.WithCommand(append([]string{nvidiaSMIPath}, args...)...), + process.WithCommand(commandArgs...), process.WithRunAsBashScript(), ) if err != nil { return nil, err } - log.Logger.Debugw("starting nvidia-smi", "args", args) + log.Logger.Debugw("starting nvidia-smi", "args", commandArgs) if err := p.Start(ctx); err != nil { return nil, err } @@ -71,6 +66,8 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) { // [Sat Oct 12 18:38:44 2024] _nv042330rm+0x10/0x40 [nvidia] // [Sat Oct 12 18:38:44 2024] ? 
_nv043429rm+0x23c/0x290 errc := make(chan error, 1) + + mu := sync.Mutex{} lines := make([]string, 0) go func() { err := process.Read( @@ -79,35 +76,40 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) { process.WithReadStdout(), process.WithReadStderr(), process.WithProcessLine(func(line string) { + mu.Lock() lines = append(lines, line) + mu.Unlock() }), process.WithWaitForCmd(), ) errc <- err }() - partialOutputErr := "" - if len(lines) > 0 { - partialOutputErr = fmt.Sprintf("\n\n(partial) output:\n%s", strings.Join(lines, "\n")) - } - select { case <-ctx.Done(): - return nil, fmt.Errorf("nvidia-smi command timed out: %w%s", ctx.Err(), partialOutputErr) + mu.Lock() + lineOutput := strings.Join(lines, "\n") + mu.Unlock() + + return nil, fmt.Errorf("nvidia-smi command timed out: %w\n\n(partial) output:\n%s", ctx.Err(), lineOutput) case err := <-errc: + mu.Lock() + lineOutput := strings.Join(lines, "\n") + mu.Unlock() + if err != nil { - return nil, fmt.Errorf("nvidia-smi command failed: %w%s", err, partialOutputErr) + return nil, fmt.Errorf("nvidia-smi command failed: %w\n\n(partial) output:\n%s", err, lineOutput) } - return []byte(strings.Join(lines, "\n")), nil + return []byte(lineOutput), nil } } // Make sure to call this with a timeout, as a broken GPU may block the command. 
// e.g., // nvAssertOkFailedNoLog: Assertion failed: Call timed out [NV_ERR_TIMEOUT] (0x00000065) returned from pRmApi->Control(pRmApi, RES_GET_CLIENT_HANDLE(pKernelChannel), RES_GET_HANDLE(pKernelChannel), -func GetSMIOutput(ctx context.Context) (*SMIOutput, error) { - qb, err := RunSMI(ctx, "--query") +func GetSMIOutput(ctx context.Context, smiCmds []string, smiQueryCmds []string) (*SMIOutput, error) { + qb, err := RunSMI(ctx, smiQueryCmds) if err != nil { return nil, err } @@ -117,7 +119,7 @@ func GetSMIOutput(ctx context.Context) (*SMIOutput, error) { return nil, err } - sb, err := RunSMI(ctx) + sb, err := RunSMI(ctx, smiCmds) if err != nil { if IsErrDeviceHandleUnknownError(err) { o.SummaryFailure = err diff --git a/components/accelerator/nvidia/query/nvidia_smi_query_test.go b/components/accelerator/nvidia/query/nvidia_smi_query_test.go index cb6c50c3..beb8c27e 100644 --- a/components/accelerator/nvidia/query/nvidia_smi_query_test.go +++ b/components/accelerator/nvidia/query/nvidia_smi_query_test.go @@ -1,12 +1,44 @@ package query import ( + "context" "os" "path/filepath" "reflect" "testing" + "time" ) +func TestGetSMIOutput(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // iterate all files in "testdata/" + matches, err := filepath.Glob("testdata/nvidia-smi-query.*.out.*.valid") + if err != nil { + t.Fatalf("failed to glob: %v", err) + } + + for _, queryFile := range matches { + o, err := GetSMIOutput( + ctx, + []string{"cat", "testdata/nvidia-smi.550.90.07.out.0.valid"}, + []string{"cat", queryFile}, + ) + if err != nil { + // TODO: fix + // CI can be flaky due to "cat" output being different + t.Logf("%q: %v", queryFile, err) + continue + } + + o.Raw = "" + o.Summary = "" + + t.Logf("%q:\n%+v", queryFile, o) + } +} + func TestParse4090Valid(t *testing.T) { data, err := os.ReadFile("testdata/nvidia-smi-query.535.154.05.out.0.valid.4090") if err != nil { diff --git 
a/components/accelerator/nvidia/query/nvml/nvml.go b/components/accelerator/nvidia/query/nvml/nvml.go index 2d3df19e..c82b969f 100644 --- a/components/accelerator/nvidia/query/nvml/nvml.go +++ b/components/accelerator/nvidia/query/nvml/nvml.go @@ -239,6 +239,7 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) { nvmlExistsMsg: nvmlExistsMsg, dbRW: op.dbRW, + dbRO: op.dbRO, clockEventsSupported: clockEventsSupported, clockEventsHWSlowdownCh: make(chan *ClockEvents, 100), diff --git a/components/accelerator/nvidia/query/options.go b/components/accelerator/nvidia/query/options.go new file mode 100644 index 00000000..3f5d4edf --- /dev/null +++ b/components/accelerator/nvidia/query/options.go @@ -0,0 +1,80 @@ +package query + +import "database/sql" + +type Op struct { + dbRW *sql.DB + dbRO *sql.DB + nvidiaSMICommand string + nvidiaSMIQueryCommand string + ibstatCommand string + infinibandClassDirectory string + debug bool +} + +type OpOption func(*Op) + +func (op *Op) applyOpts(opts []OpOption) error { + for _, opt := range opts { + opt(op) + } + + if op.nvidiaSMICommand == "" { + op.nvidiaSMICommand = "nvidia-smi" + } + if op.nvidiaSMIQueryCommand == "" { + op.nvidiaSMIQueryCommand = "nvidia-smi --query" + } + if op.ibstatCommand == "" { + op.ibstatCommand = "ibstat" + } + if op.infinibandClassDirectory == "" { + op.infinibandClassDirectory = "/sys/class/infiniband" + } + + return nil +} + +func WithDBRW(db *sql.DB) OpOption { + return func(op *Op) { + op.dbRW = db + } +} + +func WithDBRO(db *sql.DB) OpOption { + return func(op *Op) { + op.dbRO = db + } +} + +// Specifies the nvidia-smi binary path to overwrite the default path. +func WithNvidiaSMICommand(p string) OpOption { + return func(op *Op) { + op.nvidiaSMICommand = p + } +} + +func WithNvidiaSMIQueryCommand(p string) OpOption { + return func(op *Op) { + op.nvidiaSMIQueryCommand = p + } +} + +// Specifies the ibstat binary path to overwrite the default path. 
+func WithIbstatCommand(p string) OpOption { + return func(op *Op) { + op.ibstatCommand = p + } +} + +func WithInfinibandClassDirectory(p string) OpOption { + return func(op *Op) { + op.infinibandClassDirectory = p + } +} + +func WithDebug(debug bool) OpOption { + return func(op *Op) { + op.debug = debug + } +} diff --git a/components/accelerator/nvidia/query/peermem/peermem.go b/components/accelerator/nvidia/query/peermem/peermem.go index b1984354..28dccc06 100644 --- a/components/accelerator/nvidia/query/peermem/peermem.go +++ b/components/accelerator/nvidia/query/peermem/peermem.go @@ -8,7 +8,6 @@ import ( "strings" "time" - "github.com/leptonai/gpud/components/accelerator/nvidia/query/infiniband" "github.com/leptonai/gpud/log" "github.com/leptonai/gpud/pkg/process" ) @@ -66,9 +65,7 @@ func CheckLsmodPeermemModule(ctx context.Context) (*LsmodPeermemModuleOutput, er } o := &LsmodPeermemModuleOutput{ - IbstatExists: infiniband.IbstatExists(), - InfinibandClassExists: infiniband.CountInfinibandClass() > 0, - Raw: strings.Join(lines, "\n"), + Raw: strings.Join(lines, "\n"), } o.IbcoreUsingPeermemModule = HasLsmodInfinibandPeerMem(o.Raw) @@ -102,8 +99,6 @@ func HasLsmodInfinibandPeerMem(lsmodOutput string) bool { } type LsmodPeermemModuleOutput struct { - IbstatExists bool `json:"ibstat_exists"` - InfinibandClassExists bool `json:"infiniband_class_exists"` Raw string `json:"raw"` IbcoreUsingPeermemModule bool `json:"ibcore_using_peermem_module"` } diff --git a/components/accelerator/nvidia/query/query.go b/components/accelerator/nvidia/query/query.go index c1a76bc0..3654393d 100644 --- a/components/accelerator/nvidia/query/query.go +++ b/components/accelerator/nvidia/query/query.go @@ -4,9 +4,9 @@ package query import ( "context" - "database/sql" "fmt" "os" + "strings" "sync" "time" @@ -28,6 +28,7 @@ import ( query_config "github.com/leptonai/gpud/components/query/config" "github.com/leptonai/gpud/components/systemd" "github.com/leptonai/gpud/log" + 
"github.com/leptonai/gpud/pkg/file" go_nvml "github.com/NVIDIA/go-nvml/pkg/nvml" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -40,7 +41,7 @@ var ( ) // only set once since it relies on the kube client and specific port -func SetDefaultPoller(dbRW *sql.DB, dbRO *sql.DB) { +func SetDefaultPoller(opts ...OpOption) { defaultPollerOnce.Do(func() { defaultPoller = query.New( "shared-nvidia-poller", @@ -51,7 +52,7 @@ func SetDefaultPoller(dbRW *sql.DB, dbRO *sql.DB) { Retention: metav1.Duration{Duration: query_config.DefaultStateRetention}, }, }, - CreateGet(dbRW, dbRO), + CreateGet(opts...), nil, ) }) @@ -70,21 +71,26 @@ func GetSuccessOnce() <-chan any { return getSuccessOnce } -func CreateGet(dbRW *sql.DB, dbRO *sql.DB) query.GetFunc { +func CreateGet(opts ...OpOption) query.GetFunc { return func(ctx context.Context) (_ any, e error) { // "ctx" here is the root level and used for instantiating the "shared" NVML instance "once" // and all other sub-calls have its own context timeouts, thus we do not set the timeout here // otherwise, we will cancel all future operations when the instance is created only once! - return Get(ctx, dbRW, dbRO) + return Get(ctx, opts...) } } // Get all nvidia component queries. 
-func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error) { +func Get(ctx context.Context, opts ...OpOption) (output any, err error) { + op := &Op{} + if err := op.applyOpts(opts); err != nil { + return nil, fmt.Errorf("failed to apply options: %w", err) + } + if err := nvml.StartDefaultInstance( ctx, - nvml.WithDBRW(dbRW), - nvml.WithDBRO(dbRO), + nvml.WithDBRW(op.dbRW), + nvml.WithDBRO(op.dbRO), nvml.WithGPMMetricsID( go_nvml.GPM_METRIC_SM_OCCUPANCY, go_nvml.GPM_METRIC_INTEGER_UTIL, @@ -100,12 +106,20 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error return nil, fmt.Errorf("failed to start nvml instance: %w", err) } + p, err := file.LocateExecutable(strings.Split(op.nvidiaSMICommand, " ")[0]) + smiExists := err == nil && p != "" + + p, err = file.LocateExecutable(strings.Split(op.ibstatCommand, " ")[0]) + ibstatExists := err == nil && p != "" + + ibClassCount := infiniband.CountInfinibandClassBySubDir(op.infinibandClassDirectory) + o := &Output{ Time: time.Now().UTC(), - SMIExists: SMIExists(), + SMIExists: smiExists, FabricManagerExists: FabricManagerExists(), - InfinibandClassExists: infiniband.CountInfinibandClass() > 0, - IbstatExists: infiniband.IbstatExists(), + InfinibandClassExists: ibClassCount > 0, + IbstatExists: ibstatExists, } log.Logger.Debugw("counting gpu devices") @@ -175,9 +189,9 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error } if o.InfinibandClassExists && o.IbstatExists { - log.Logger.Debugw("running ibstat") + log.Logger.Debugw("running ibstat", "command", op.ibstatCommand) cctx, ccancel := context.WithTimeout(ctx, 30*time.Second) - o.Ibstat, err = infiniband.RunIbstat(cctx) + o.Ibstat, err = infiniband.GetIbstatOutput(cctx, []string{op.ibstatCommand}) ccancel() if err != nil { if o.Ibstat == nil { @@ -248,7 +262,10 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error if o.SMIExists { // call this with a timeout, as a 
broken GPU may block the command. cctx, ccancel := context.WithTimeout(ctx, 2*time.Minute) - o.SMI, err = GetSMIOutput(cctx) + o.SMI, err = GetSMIOutput(cctx, + []string{op.nvidiaSMICommand}, + []string{op.nvidiaSMIQueryCommand}, + ) ccancel() if err != nil { o.SMIQueryErrors = append(o.SMIQueryErrors, err.Error()) @@ -265,7 +282,7 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error events := o.SMI.HWSlowdownEvents(truncNowUTC.Unix()) for _, event := range events { cctx, ccancel = context.WithTimeout(ctx, time.Minute) - found, err := metrics_clock_events_state.FindEvent(cctx, dbRO, event) + found, err := metrics_clock_events_state.FindEvent(cctx, op.dbRO, event) ccancel() if err != nil { o.SMIQueryErrors = append(o.SMIQueryErrors, fmt.Sprintf("failed to find clock events: %v", err)) @@ -275,7 +292,7 @@ func Get(ctx context.Context, dbRW *sql.DB, dbRO *sql.DB) (output any, err error continue } cctx, ccancel = context.WithTimeout(ctx, time.Minute) - err = metrics_clock_events_state.InsertEvent(cctx, dbRW, event) + err = metrics_clock_events_state.InsertEvent(cctx, op.dbRW, event) ccancel() if err != nil { o.SMIQueryErrors = append(o.SMIQueryErrors, fmt.Sprintf("failed to persist clock events: %v", err)) @@ -401,7 +418,12 @@ const ( warningSign = "\033[31m✘\033[0m" ) -func (o *Output) PrintInfo(debug bool) { +func (o *Output) PrintInfo(opts ...OpOption) { + options := &Op{} + if err := options.applyOpts(opts); err != nil { + log.Logger.Warnw("failed to apply options", "error", err) + } + if len(o.SMIQueryErrors) > 0 { fmt.Printf("%s nvidia-smi check failed with %d error(s)\n", warningSign, len(o.SMIQueryErrors)) for _, err := range o.SMIQueryErrors { @@ -441,6 +463,16 @@ func (o *Output) PrintInfo(debug bool) { } else { fmt.Printf("%s successfully checked ibstat\n", checkMark) } + + if o.Ibstat != nil { + atLeastPorts := infiniband.CountInfinibandClassBySubDir(options.infinibandClassDirectory) + atLeastRate := 
infiniband.SupportsInfinibandPortRate(o.GPUProductNameFromNVML()) + if err := o.Ibstat.Parsed.CheckPortsAndRate(atLeastPorts, atLeastRate); err != nil { + fmt.Printf("%s ibstat ports/rates check failed (%s)\n", warningSign, err) + } else { + fmt.Printf("%s ibstat ports/rates check passed (at least ports: %d, rate: %v)\n", checkMark, atLeastPorts, atLeastRate) + } + } } else { fmt.Printf("%s skipped ibstat check (infiniband class not found or ibstat not found)\n", checkMark) } @@ -545,7 +577,7 @@ func (o *Output) PrintInfo(debug bool) { } } - if debug { + if options.debug { copied := *o if copied.Ibstat != nil { copied.Ibstat.Raw = "" diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi.525.125.06.out.0.valid b/components/accelerator/nvidia/query/testdata/nvidia-smi.525.125.06.out.0.valid new file mode 100644 index 00000000..22fd2fed --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi.525.125.06.out.0.valid @@ -0,0 +1,11 @@ ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 525.125.06 Driver Version: 525.125.06 CUDA Version: 12.0 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|===============================+======================+======================| +| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | 0 | +|ERR! 38C P5 49W / 450W | 2021MiB / 23028MiB | 0% E. 
Process | +| | | N/A | ++-------------------------------+----------------------+----------------------+ \ No newline at end of file diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi.550.90.07.out.0.valid b/components/accelerator/nvidia/query/testdata/nvidia-smi.550.90.07.out.0.valid new file mode 100644 index 00000000..9d45fc68 --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi.550.90.07.out.0.valid @@ -0,0 +1,48 @@ +Mon Jan 20 10:18:26 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 27C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 24C P0 74W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 23C P0 70W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 23C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 27C P0 73W / 700W | 1MiB / 
81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 22C P0 74W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 24C P0 73W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 23C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/components/accelerator/nvidia/remapped-rows/component.go b/components/accelerator/nvidia/remapped-rows/component.go index b9eba052..f9e1f5a1 100644 --- a/components/accelerator/nvidia/remapped-rows/component.go +++ b/components/accelerator/nvidia/remapped-rows/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_remapped_rows "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/remapped-rows" "github.com/leptonai/gpud/components/query" @@ -18,11 +19,18 @@ import ( const Name = "accelerator-nvidia-remapped-rows" 
-func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/remapped-rows/config.go b/components/accelerator/nvidia/remapped-rows/config.go deleted file mode 100644 index c298ffbd..00000000 --- a/components/accelerator/nvidia/remapped-rows/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package remappedrows - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/temperature/component.go b/components/accelerator/nvidia/temperature/component.go index 0816b0e7..63d1f59d 100644 --- a/components/accelerator/nvidia/temperature/component.go +++ b/components/accelerator/nvidia/temperature/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + 
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_temperature "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/temperature" "github.com/leptonai/gpud/components/query" @@ -18,11 +19,18 @@ import ( const Name = "accelerator-nvidia-temperature" -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/temperature/config.go b/components/accelerator/nvidia/temperature/config.go deleted file mode 100644 index 04f371b1..00000000 --- a/components/accelerator/nvidia/temperature/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package temperature - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config `json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) 
Validate() error { - return nil -} diff --git a/components/accelerator/nvidia/utilization/component.go b/components/accelerator/nvidia/utilization/component.go index e5b7910e..c4e117e8 100644 --- a/components/accelerator/nvidia/utilization/component.go +++ b/components/accelerator/nvidia/utilization/component.go @@ -8,6 +8,7 @@ import ( "time" "github.com/leptonai/gpud/components" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_utilization "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/utilization" "github.com/leptonai/gpud/components/query" @@ -18,11 +19,18 @@ import ( const Name = "accelerator-nvidia-utilization" -func New(ctx context.Context, cfg Config) components.Component { +func New(ctx context.Context, cfg nvidia_common.Config) components.Component { cfg.Query.SetDefaultsIfNotSet() cctx, ccancel := context.WithCancel(ctx) - nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO) + nvidia_query.SetDefaultPoller( + nvidia_query.WithDBRW(cfg.Query.State.DBRW), + nvidia_query.WithDBRO(cfg.Query.State.DBRO), + nvidia_query.WithNvidiaSMICommand(cfg.NvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(cfg.NvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(cfg.IbstatCommand), + nvidia_query.WithInfinibandClassDirectory(cfg.InfinibandClassDirectory), + ) nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name) return &component{ diff --git a/components/accelerator/nvidia/utilization/config.go b/components/accelerator/nvidia/utilization/config.go deleted file mode 100644 index d60445ef..00000000 --- a/components/accelerator/nvidia/utilization/config.go +++ /dev/null @@ -1,33 +0,0 @@ -package utilization - -import ( - "database/sql" - "encoding/json" - - query_config "github.com/leptonai/gpud/components/query/config" -) - -type Config struct { - Query query_config.Config 
`json:"query"` -} - -func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - if cfg.Query.State != nil { - cfg.Query.State.DBRW = dbRW - cfg.Query.State.DBRO = dbRO - } - return cfg, nil -} - -func (cfg Config) Validate() error { - return nil -} diff --git a/components/diagnose/diagnose.go b/components/diagnose/diagnose.go index 9b3e212f..9c37ca47 100644 --- a/components/diagnose/diagnose.go +++ b/components/diagnose/diagnose.go @@ -283,7 +283,11 @@ func run(ctx context.Context, dir string, opts ...OpOption) error { }) } else { fmt.Printf("%s checking nvidia-smi output\n", inProgress) - nout, err := nvidia_query.GetSMIOutput(ctx) + nout, err := nvidia_query.GetSMIOutput( + ctx, + []string{"nvidia-smi"}, + []string{"nvidia-smi", "--query"}, + ) if err != nil { o.Results = append(o.Results, CommandResult{ Command: "nvidia-smi -q", diff --git a/components/diagnose/options.go b/components/diagnose/options.go index d365d9e5..ee27423e 100644 --- a/components/diagnose/options.go +++ b/components/diagnose/options.go @@ -1,6 +1,11 @@ package diagnose type Op struct { + nvidiaSMICommand string + nvidiaSMIQueryCommand string + ibstatCommand string + infinibandClassDirectory string + lines int debug bool createArchive bool @@ -20,12 +25,52 @@ func (op *Op) applyOpts(opts []OpOption) error { for _, opt := range opts { opt(op) } + + if op.nvidiaSMICommand == "" { + op.nvidiaSMICommand = "nvidia-smi" + } + if op.nvidiaSMIQueryCommand == "" { + op.nvidiaSMIQueryCommand = "nvidia-smi --query" + } + if op.ibstatCommand == "" { + op.ibstatCommand = "ibstat" + } + if op.infinibandClassDirectory == "" { + op.infinibandClassDirectory = "/sys/class/infiniband" + } + if op.lines == 0 { op.lines = 100 } return nil } +// Specifies the nvidia-smi binary path to overwrite the default path. 
+func WithNvidiaSMICommand(p string) OpOption { + return func(op *Op) { + op.nvidiaSMICommand = p + } +} + +func WithNvidiaSMIQueryCommand(p string) OpOption { + return func(op *Op) { + op.nvidiaSMIQueryCommand = p + } +} + +// Specifies the ibstat binary path to overwrite the default path. +func WithIbstatCommand(p string) OpOption { + return func(op *Op) { + op.ibstatCommand = p + } +} + +func WithInfinibandClassDirectory(p string) OpOption { + return func(op *Op) { + op.infinibandClassDirectory = p + } +} + func WithLines(lines int) OpOption { return func(op *Op) { op.lines = lines diff --git a/components/diagnose/scan.go b/components/diagnose/scan.go index 1733ba0b..d66a63fa 100644 --- a/components/diagnose/scan.go +++ b/components/diagnose/scan.go @@ -9,6 +9,7 @@ import ( "time" "github.com/dustin/go-humanize" + "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state" nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml" @@ -130,7 +131,15 @@ func Scan(ctx context.Context, opts ...OpOption) error { log.Logger.Fatalw("failed to create clock events state table", "error", err) } - outputRaw, err := nvidia_query.Get(ctx, db, db) + outputRaw, err := nvidia_query.Get( + ctx, + nvidia_query.WithDBRW(db), + nvidia_query.WithDBRO(db), + nvidia_query.WithNvidiaSMICommand(op.nvidiaSMICommand), + nvidia_query.WithNvidiaSMIQueryCommand(op.nvidiaSMIQueryCommand), + nvidia_query.WithIbstatCommand(op.ibstatCommand), + nvidia_query.WithInfinibandClassDirectory(op.infinibandClassDirectory), + ) if err != nil { log.Logger.Warnw("error getting nvidia info", "error", err) } else { @@ -145,7 +154,7 @@ func Scan(ctx context.Context, opts ...OpOption) error { if !ok { log.Logger.Warnf("expected *nvidia_query.Output, got %T", outputRaw) } else { - 
output.PrintInfo(op.debug) + output.PrintInfo(query.WithDebug(op.debug), query.WithInfinibandClassDirectory(op.infinibandClassDirectory)) if op.pollXidEvents { fmt.Printf("\n%s checking nvidia xid errors\n", inProgress) diff --git a/config/config.go b/config/config.go index 81247c1b..c8d87c65 100644 --- a/config/config.go +++ b/config/config.go @@ -46,6 +46,9 @@ type Config struct { // Configures the local web configuration. Web *Web `json:"web,omitempty"` + // Overwrites the tool binaries for testing. + ToolOverwriteOptions ToolOverwriteOptions `json:"tool_overwrite_options"` + // Set false to disable auto update EnableAutoUpdate bool `json:"enable_auto_update"` @@ -70,6 +73,13 @@ type Web struct { SincePeriod metav1.Duration `json:"since_period"` } +type ToolOverwriteOptions struct { + NvidiaSMICommand string `json:"nvidia_smi_command"` + NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"` + IbstatCommand string `json:"ibstat_command"` + InfinibandClassDirectory string `json:"infiniband_class_directory"` +} + var ErrInvalidAutoUpdateExitCode = errors.New("auto_update_exit_code is only valid when auto_update is enabled") func (config *Config) Validate() error { diff --git a/config/default.go b/config/default.go index 858a1993..afff3d9d 100644 --- a/config/default.go +++ b/config/default.go @@ -157,6 +157,13 @@ func DefaultConfig(ctx context.Context, opts ...OpOption) (*Config, error) { SincePeriod: DefaultRetentionPeriod, }, + ToolOverwriteOptions: ToolOverwriteOptions{ + NvidiaSMICommand: options.NvidiaSMICommand, + NvidiaSMIQueryCommand: options.NvidiaSMIQueryCommand, + IbstatCommand: options.IbstatCommand, + InfinibandClassDirectory: options.InfinibandClassDirectory, + }, + EnableAutoUpdate: true, } diff --git a/config/op_options.go b/config/op_options.go index 2be8fed0..603600f4 100644 --- a/config/op_options.go +++ b/config/op_options.go @@ -1,6 +1,9 @@ package config -import "github.com/leptonai/gpud/components/accelerator/nvidia/infiniband" 
+import ( + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" + "github.com/leptonai/gpud/components/accelerator/nvidia/infiniband" +) type Op struct { FilesToCheck []string @@ -8,6 +11,8 @@ type Op struct { ExpectedPortStates *infiniband.ExpectedPortStates DockerIgnoreConnectionErrors bool KubeletIgnoreConnectionErrors bool + + nvidia_common.ToolOverwrites } type OpOption func(*Op) @@ -49,3 +54,29 @@ func WithKubeletIgnoreConnectionErrors(b bool) OpOption { op.KubeletIgnoreConnectionErrors = b } } + +// Specifies the nvidia-smi binary path to overwrite the default path. +func WithNvidiaSMICommand(p string) OpOption { + return func(op *Op) { + op.NvidiaSMICommand = p + } +} + +func WithNvidiaSMIQueryCommand(p string) OpOption { + return func(op *Op) { + op.NvidiaSMIQueryCommand = p + } +} + +// Specifies the ibstat binary path to overwrite the default path. +func WithIbstatCommand(p string) OpOption { + return func(op *Op) { + op.IbstatCommand = p + } +} + +func WithInfinibandClassDirectory(p string) OpOption { + return func(op *Op) { + op.InfinibandClassDirectory = p + } +} diff --git a/internal/server/server.go b/internal/server/server.go index cf4e2681..ae980fb0 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -34,6 +34,7 @@ import ( nvidia_badenvs_id "github.com/leptonai/gpud/components/accelerator/nvidia/bad-envs/id" nvidia_clock_speed "github.com/leptonai/gpud/components/accelerator/nvidia/clock-speed" nvidia_clock_speed_id "github.com/leptonai/gpud/components/accelerator/nvidia/clock-speed/id" + nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_ecc "github.com/leptonai/gpud/components/accelerator/nvidia/ecc" nvidia_ecc_id "github.com/leptonai/gpud/components/accelerator/nvidia/ecc/id" nvidia_error "github.com/leptonai/gpud/components/accelerator/nvidia/error" @@ -728,9 +729,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents 
= append(allComponents, tailscale.New(ctx, cfg)) case nvidia_info.Name: - cfg := nvidia_info.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_info.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -742,9 +743,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_info.New(ctx, cfg)) case nvidia_badenvs_id.Name: - cfg := nvidia_badenvs.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_badenvs.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -756,9 +757,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_badenvs.New(ctx, cfg)) case nvidia_error.Name: - cfg := nvidia_error.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_error.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -771,9 +772,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID case nvidia_component_error_xid_id.Name: // "defaultQueryCfg" here has the db object to read xid events (read-only, writes are done in poller) - cfg := nvidia_error_xid.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: 
defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_error_xid.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -789,9 +790,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_error_sxid.New()) case nvidia_component_error_xid_sxid_id.Name: - cfg := nvidia_component_error_xid_sxid.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_component_error_xid_sxid.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -803,9 +804,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_component_error_xid_sxid.New(ctx, cfg)) case nvidia_hw_slowdown_id.Name: - cfg := nvidia_hw_slowdown.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_hw_slowdown.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -817,9 +818,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_hw_slowdown.New(ctx, cfg)) case nvidia_clock_speed_id.Name: - cfg := nvidia_clock_speed.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := 
nvidia_clock_speed.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -831,9 +832,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_clock_speed.New(ctx, cfg)) case nvidia_ecc_id.Name: - cfg := nvidia_ecc.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_ecc.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -845,9 +846,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_ecc.New(ctx, cfg)) case nvidia_memory.Name: - cfg := nvidia_memory.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_memory.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -859,9 +860,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_memory.New(ctx, cfg)) case nvidia_gpm.Name: - cfg := nvidia_gpm.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_gpm.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -873,9 +874,9 @@ 
func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_gpm.New(ctx, cfg)) case nvidia_nvlink.Name: - cfg := nvidia_nvlink.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_nvlink.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -887,9 +888,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_nvlink.New(ctx, cfg)) case nvidia_power_id.Name: - cfg := nvidia_power.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_power.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -901,9 +902,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_power.New(ctx, cfg)) case nvidia_temperature.Name: - cfg := nvidia_temperature.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_temperature.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -915,9 +916,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_temperature.New(ctx, cfg)) case nvidia_utilization.Name: - cfg := 
nvidia_utilization.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_utilization.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -929,9 +930,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_utilization.New(ctx, cfg)) case nvidia_processes.Name: - cfg := nvidia_processes.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_processes.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -943,9 +944,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_processes.New(ctx, cfg)) case nvidia_remapped_rows.Name: - cfg := nvidia_remapped_rows.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_remapped_rows.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -957,7 +958,11 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_remapped_rows.New(ctx, cfg)) case nvidia_fabric_manager.Name: - cfg := nvidia_fabric_manager.Config{Query: defaultQueryCfg, Log: nvidia_fabric_manager.DefaultLogConfig()} + cfg := nvidia_fabric_manager.Config{ + Query: 
defaultQueryCfg, + Log: nvidia_fabric_manager.DefaultLogConfig(), + ToolOverwrites: options.ToolOverwrites, + } if configValue != nil { parsed, err := nvidia_fabric_manager.ParseConfig(configValue, dbRW, dbRO) if err != nil { @@ -975,9 +980,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, fabricManagerLogComponent) case nvidia_gsp_firmware_mode_id.Name: - cfg := nvidia_gsp_firmware_mode.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_gsp_firmware_mode.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -990,7 +995,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID case nvidia_infiniband_id.Name: cfg := &nvidia_infiniband.Config{ - Query: defaultQueryCfg, + Query: defaultQueryCfg, + ExpectedPortStates: nvidia_infiniband.ExpectedPortStates{}, // for now, we set empty + ToolOverwrites: options.ToolOverwrites, } if configValue != nil { parsed, err := nvidia_infiniband.ParseConfig(configValue, dbRW, dbRO) @@ -1005,9 +1012,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_infiniband.New(ctx, *cfg)) case nvidia_peermem_id.Name: - cfg := nvidia_peermem.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_peermem.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -1019,9 +1026,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, 
cliUID allComponents = append(allComponents, nvidia_peermem.New(ctx, cfg)) case nvidia_persistence_mode_id.Name: - cfg := nvidia_persistence_mode.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_persistence_mode.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } @@ -1033,9 +1040,9 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, nvidia_persistence_mode.New(ctx, cfg)) case nvidia_nccl_id.Name: - cfg := nvidia_nccl.Config{Query: defaultQueryCfg} + cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites} if configValue != nil { - parsed, err := nvidia_nccl.ParseConfig(configValue, dbRW, dbRO) + parsed, err := nvidia_common.ParseConfig(configValue, dbRW, dbRO) if err != nil { return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) } From 366895c91c0c057b9566ba9517e9d19fb8fda6ce Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 21 Jan 2025 21:53:35 +0800 Subject: [PATCH 2/4] add Signed-off-by: Gyuho Lee --- cmd/gpud/command/run.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/gpud/command/run.go b/cmd/gpud/command/run.go index 8a02bb88..882833d5 100644 --- a/cmd/gpud/command/run.go +++ b/cmd/gpud/command/run.go @@ -51,6 +51,11 @@ func cmdRun(cliContext *cli.Context) error { config.WithFilesToCheck(filesToCheck...), config.WithDockerIgnoreConnectionErrors(dockerIgnoreConnectionErrors), config.WithKubeletIgnoreConnectionErrors(kubeletIgnoreConnectionErrors), + + config.WithNvidiaSMICommand(nvidiaSMICommand), + config.WithNvidiaSMIQueryCommand(nvidiaSMIQueryCommand), + config.WithIbstatCommand(ibstatCommand), + 
config.WithInfinibandClassDirectory(infinibandClassDirectory), } if expectedPortStates != "" { From 9d994b6e8402a67934e8968355504c31ae8e135b Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 21 Jan 2025 21:56:57 +0800 Subject: [PATCH 3/4] lower log level Signed-off-by: Gyuho Lee --- pkg/process/process.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/process/process.go b/pkg/process/process.go index 6d8a49a4..c00f9931 100644 --- a/pkg/process/process.go +++ b/pkg/process/process.go @@ -313,7 +313,7 @@ func (p *process) watchCmd() { log.Logger.Warnw("command was terminated (exit code -1) for unknown reasons", "cmd", p.cmd.String()) } } else { - log.Logger.Warnw("command exited with non-zero status", "error", err, "cmd", p.cmd.String(), "exitCode", exitErr.ExitCode()) + log.Logger.Debugw("command exited with non-zero status", "error", err, "cmd", p.cmd.String(), "exitCode", exitErr.ExitCode()) } } else { log.Logger.Warnw("error waiting for command to finish", "error", err, "cmd", p.cmd.String()) From 3d02e9ab3e6278a2b8e4ab59a6851ecbc57c38a5 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 21 Jan 2025 22:12:12 +0800 Subject: [PATCH 4/4] debug Signed-off-by: Gyuho Lee --- pkg/process/process.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/process/process.go b/pkg/process/process.go index c00f9931..b077c23d 100644 --- a/pkg/process/process.go +++ b/pkg/process/process.go @@ -320,7 +320,7 @@ func (p *process) watchCmd() { } if p.restartConfig == nil || !p.restartConfig.OnError { - log.Logger.Warnw("process exited with error", "error", err) + log.Logger.Debugw("process exited with error", "error", err) return }