Skip to content

Commit

Permalink
feat(infiniband): default ports/rates
Browse files Browse the repository at this point in the history
remove expected-port-states flag

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Jan 21, 2025
1 parent 4d06119 commit 675c13a
Show file tree
Hide file tree
Showing 12 changed files with 80 additions and 115 deletions.
7 changes: 0 additions & 7 deletions cmd/gpud/command/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ var (
filesToCheck cli.StringSlice
kernelModulesToCheck cli.StringSlice

expectedPortStates string

dockerIgnoreConnectionErrors bool
kubeletIgnoreConnectionErrors bool

Expand Down Expand Up @@ -252,11 +250,6 @@ sudo rm /etc/systemd/system/gpud.service
Usage: "enable 'kernel-module' component that returns healthy if and only if all the kernel modules are loaded (default: [], use '--kernel-modules-to-check=a --kernel-modules-to-check=b' for multiple modules)",
Value: &kernelModulesToCheck,
},
&cli.StringFlag{
Name: "expected-port-states-nvidia-infiniband",
Usage: "set the expected port states for NVIDIA InfiniBand (e.g., --expected-port-states-nvidia-infiniband='{\"at_least_ports\": 4, \"at_least_rate\": 400}')",
Destination: &expectedPortStates,
},
&cli.BoolFlag{
Name: "docker-ignore-connection-errors",
Usage: "ignore connection errors to docker daemon, useful when docker daemon is not running (default: false)",
Expand Down
9 changes: 0 additions & 9 deletions cmd/gpud/command/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"runtime"
"time"

"github.com/leptonai/gpud/components/accelerator/nvidia/infiniband"
"github.com/leptonai/gpud/config"
lepServer "github.com/leptonai/gpud/internal/server"
"github.com/leptonai/gpud/log"
Expand Down Expand Up @@ -58,14 +57,6 @@ func cmdRun(cliContext *cli.Context) error {
config.WithInfinibandClassDirectory(infinibandClassDirectory),
}

if expectedPortStates != "" {
portStates := &infiniband.ExpectedPortStates{}
if err := json.Unmarshal([]byte(expectedPortStates), portStates); err != nil {
return err
}
configOpts = append(configOpts, config.WithExpectedPortStates(*portStates))
}

ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
cfg, err := config.DefaultConfig(ctx, configOpts...)
cancel()
Expand Down
1 change: 1 addition & 0 deletions components/accelerator/nvidia/infiniband/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
}

output := ToOutput(allOutput)

return output.States(c.cfg)
}

Expand Down
19 changes: 15 additions & 4 deletions components/accelerator/nvidia/infiniband/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/accelerator/nvidia/query/infiniband"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/log"
)

// ToOutput converts nvidia_query.Output to Output.
Expand Down Expand Up @@ -101,7 +102,7 @@ var (

// Evaluate returns the output evaluation reason and its healthy-ness.
// Evaluate itself DOES NOT auto-detect infiniband devices/PCI buses; it relies strictly on the expected port states passed in by the caller.
func (o *Output) Evaluate(cfg Config) (string, bool, error) {
func (o *Output) Evaluate(cfg ExpectedPortStates) (string, bool, error) {
// nothing specified for this machine, gpud MUST skip the ib check
if cfg.AtLeastPorts == 0 && cfg.AtLeastRate == 0 {
return msgNoAtLeastPortsOrRateSet, true, nil
Expand All @@ -123,8 +124,8 @@ func (o *Output) Evaluate(cfg Config) (string, bool, error) {
return msgNoIbstatDataFound, false, nil
}

atLeastPorts := cfg.ExpectedPortStates.AtLeastPorts
atLeastRate := cfg.ExpectedPortStates.AtLeastRate
atLeastPorts := cfg.AtLeastPorts
atLeastRate := cfg.AtLeastRate
if err := o.Ibstat.Parsed.CheckPortsAndRate(atLeastPorts, atLeastRate); err != nil {
return err.Error(), false, nil
}
Expand All @@ -133,7 +134,17 @@ func (o *Output) Evaluate(cfg Config) (string, bool, error) {
}

func (o *Output) States(cfg Config) ([]components.State, error) {
outputReasons, healthy, err := o.Evaluate(cfg)
// TODO: remove this once we have dynamic expected port states updates
// we only keep this for backwards compatibility
atLeastPorts := infiniband.CountInfinibandClassBySubDir(cfg.InfinibandClassDirectory)
atLeastRate := infiniband.SupportsInfinibandPortRate(o.GPUProductName)
log.Logger.Infow("setting default expected port states", "at_least_ports", atLeastPorts, "at_least_rate", atLeastRate, "gpu_product_name", o.GPUProductName)
SetDefaultExpectedPortStates(ExpectedPortStates{
AtLeastPorts: atLeastPorts,
AtLeastRate: atLeastRate,
})

outputReasons, healthy, err := o.Evaluate(GetDefaultExpectedPortStates())
if err != nil {
return nil, err
}
Expand Down
34 changes: 13 additions & 21 deletions components/accelerator/nvidia/infiniband/component_output_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ func TestOutputEvaluateEmptyConfig(t *testing.T) {
o := &Output{
GPUProductName: "NVIDIA A100",
}
cfg := Config{}
cfg := ExpectedPortStates{}
reason, healthy, err := o.Evaluate(cfg)
if err != nil {
t.Errorf("Evaluate() error = %v", err)
Expand Down Expand Up @@ -56,11 +56,9 @@ func TestOutputEvaluateH100(t *testing.T) {
},
},
}
cfg := Config{
ExpectedPortStates: ExpectedPortStates{
AtLeastPorts: 8,
AtLeastRate: 400,
},
cfg := ExpectedPortStates{
AtLeastPorts: 8,
AtLeastRate: 400,
}
reason, healthy, err := o.Evaluate(cfg)
if err != nil {
Expand All @@ -80,11 +78,9 @@ func TestOutputEvaluateNoIbstatExists(t *testing.T) {
GPUProductName: "NVIDIA H100",
IbstatExists: false,
}
cfg := Config{
ExpectedPortStates: ExpectedPortStates{
AtLeastPorts: 8,
AtLeastRate: 400,
},
cfg := ExpectedPortStates{
AtLeastPorts: 8,
AtLeastRate: 400,
}
reason, healthy, err := o.Evaluate(cfg)
if err != nil {
Expand All @@ -107,11 +103,9 @@ func TestOutputEvaluateNoIbstatDataFound(t *testing.T) {
Parsed: []infiniband.IBStatCard{},
},
}
cfg := Config{
ExpectedPortStates: ExpectedPortStates{
AtLeastPorts: 8,
AtLeastRate: 400,
},
cfg := ExpectedPortStates{
AtLeastPorts: 8,
AtLeastRate: 400,
}
reason, healthy, err := o.Evaluate(cfg)
if err != nil {
Expand Down Expand Up @@ -156,11 +150,9 @@ func TestOutputEvaluateH100MissingPort(t *testing.T) {
},
},
}
cfg := Config{
ExpectedPortStates: ExpectedPortStates{
AtLeastPorts: 8,
AtLeastRate: 400,
},
cfg := ExpectedPortStates{
AtLeastPorts: 8,
AtLeastRate: 400,
}
reason, healthy, err := o.Evaluate(cfg)
if err != nil {
Expand Down
23 changes: 21 additions & 2 deletions components/accelerator/nvidia/infiniband/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package infiniband
import (
"database/sql"
"encoding/json"
"sync"

nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
query_config "github.com/leptonai/gpud/components/query/config"
Expand All @@ -11,11 +12,29 @@ import (
type Config struct {
Query query_config.Config `json:"query"`

ExpectedPortStates

nvidia_common.ToolOverwrites
}

var (
// defaultExpectedPortStatesMu guards reads and writes of
// defaultExpectedPortStates; GetDefaultExpectedPortStates takes the read
// lock and SetDefaultExpectedPortStates takes the write lock.
defaultExpectedPortStatesMu sync.RWMutex
// defaultExpectedPortStates is the package-wide expected port states.
// It starts with both thresholds explicitly set to zero, i.e. no minimum
// port count and no minimum rate until SetDefaultExpectedPortStates is called.
defaultExpectedPortStates = ExpectedPortStates{
AtLeastPorts: 0,
AtLeastRate: 0,
}
)

// GetDefaultExpectedPortStates returns the current package-wide default
// expected port states. The value is read while holding the read lock.
func GetDefaultExpectedPortStates() ExpectedPortStates {
	defaultExpectedPortStatesMu.RLock()
	snapshot := defaultExpectedPortStates
	defaultExpectedPortStatesMu.RUnlock()

	return snapshot
}

// SetDefaultExpectedPortStates replaces the package-wide default expected
// port states with the given value. The assignment happens while holding
// the write lock.
func SetDefaultExpectedPortStates(states ExpectedPortStates) {
	defaultExpectedPortStatesMu.Lock()
	defaultExpectedPortStates = states
	defaultExpectedPortStatesMu.Unlock()
}

// Configures the expected state of the ports.
type ExpectedPortStates struct {
// The minimum number of ports.
Expand Down
55 changes: 0 additions & 55 deletions components/accelerator/nvidia/infiniband/config_test.go

This file was deleted.

4 changes: 4 additions & 0 deletions components/accelerator/nvidia/query/infiniband/ibstat.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ func (cards IBStatCards) Match(expectedPhysicalState string, expectedState strin

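// CheckPortsAndRate checks that the IB cards satisfy the given minimum port count and minimum rate.
// It returns nil without checking anything when both atLeastPorts and atLeastRate are zero.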
func (cards IBStatCards) CheckPortsAndRate(atLeastPorts int, atLeastRate int) error {
if atLeastPorts == 0 && atLeastRate == 0 {
return nil
}

totalPorts := len(cards)

// select all "up" devices, and count the ones that match the expected rate with ">="
Expand Down
24 changes: 24 additions & 0 deletions components/accelerator/nvidia/query/infiniband/ibstat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,30 @@ func TestValidateIBPorts(t *testing.T) {
atLeastRate: 200,
wantErr: errors.New("not enough LinkUp ports, only 2 LinkUp out of 4, expected at least 4 ports and 200 Gb/sec rate; some ports might be down, 2 Disabled devices with Rate > 200 found (mlx5_1, mlx5_3)"),
},
{
name: "some ports disabled but with high enough rate but missing ports/rates",
cards: IBStatCards{
{
Name: "mlx5_0",
Port1: IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200},
},
{
Name: "mlx5_1",
Port1: IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200},
},
{
Name: "mlx5_2",
Port1: IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200},
},
{
Name: "mlx5_3",
Port1: IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200},
},
},
atLeastPorts: 0,
atLeastRate: 0,
wantErr: nil,
},
{
name: "zero required ports",
cards: IBStatCards{
Expand Down
6 changes: 0 additions & 6 deletions config/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
nvidia_gpm "github.com/leptonai/gpud/components/accelerator/nvidia/gpm"
nvidia_gsp_firmware_mode_id "github.com/leptonai/gpud/components/accelerator/nvidia/gsp-firmware-mode/id"
nvidia_hw_slowdown_id "github.com/leptonai/gpud/components/accelerator/nvidia/hw-slowdown/id"
nvidia_infiniband "github.com/leptonai/gpud/components/accelerator/nvidia/infiniband"
nvidia_infiniband_id "github.com/leptonai/gpud/components/accelerator/nvidia/infiniband/id"
nvidia_info "github.com/leptonai/gpud/components/accelerator/nvidia/info"
nvidia_memory "github.com/leptonai/gpud/components/accelerator/nvidia/memory"
Expand Down Expand Up @@ -311,11 +310,6 @@ func DefaultConfig(ctx context.Context, opts ...OpOption) (*Config, error) {
cfg.Components[nvidia_fabric_manager.Name] = nil

cfg.Components[nvidia_infiniband_id.Name] = nil
if options.ExpectedPortStates != nil {
cfg.Components[nvidia_infiniband_id.Name] = &nvidia_infiniband.Config{
ExpectedPortStates: *options.ExpectedPortStates,
}
}

cfg.Components[nvidia_nccl_id.Name] = nil
cfg.Components[nvidia_peermem_id.Name] = nil
Expand Down
8 changes: 0 additions & 8 deletions config/op_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@ package config

import (
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
"github.com/leptonai/gpud/components/accelerator/nvidia/infiniband"
)

type Op struct {
FilesToCheck []string
KernelModulesToCheck []string
ExpectedPortStates *infiniband.ExpectedPortStates
DockerIgnoreConnectionErrors bool
KubeletIgnoreConnectionErrors bool

Expand Down Expand Up @@ -37,12 +35,6 @@ func WithKernelModulesToCheck(modules ...string) OpOption {
}
}

func WithExpectedPortStates(exp infiniband.ExpectedPortStates) OpOption {
return func(op *Op) {
op.ExpectedPortStates = &exp
}
}

func WithDockerIgnoreConnectionErrors(b bool) OpOption {
return func(op *Op) {
op.DockerIgnoreConnectionErrors = b
Expand Down
5 changes: 2 additions & 3 deletions internal/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -995,9 +995,8 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID

case nvidia_infiniband_id.Name:
cfg := &nvidia_infiniband.Config{
Query: defaultQueryCfg,
ExpectedPortStates: nvidia_infiniband.ExpectedPortStates{}, // for now, we set empty
ToolOverwrites: options.ToolOverwrites,
Query: defaultQueryCfg,
ToolOverwrites: options.ToolOverwrites,
}
if configValue != nil {
parsed, err := nvidia_infiniband.ParseConfig(configValue, dbRW, dbRO)
Expand Down

0 comments on commit 675c13a

Please sign in to comment.