Skip to content

Commit

Permalink
simplify
Browse files Browse the repository at this point in the history
Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Jan 21, 2025
1 parent eb4d877 commit a3a0e61
Show file tree
Hide file tree
Showing 46 changed files with 108 additions and 802 deletions.
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/bad-envs/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@ import (

"github.com/leptonai/gpud/components"
bad_envs_id "github.com/leptonai/gpud/components/accelerator/nvidia/bad-envs/id"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/clock-speed/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/leptonai/gpud/components"
nvidia_clock_speed_id "github.com/leptonai/gpud/components/accelerator/nvidia/clock-speed/id"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_query_metrics_clockspeed "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/clock-speed"
"github.com/leptonai/gpud/components/query"
Expand All @@ -17,7 +18,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
Expand Down
38 changes: 0 additions & 38 deletions components/accelerator/nvidia/clock-speed/config.go

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package badenvs
package common

import (
"database/sql"
Expand All @@ -10,6 +10,10 @@ import (
type Config struct {
Query query_config.Config `json:"query"`

ToolOverwrites
}

type ToolOverwrites struct {
NvidiaSMICommand string `json:"nvidia_smi_command"`
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
IbstatCommand string `json:"ibstat_command"`
Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/ecc/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_ecc_id "github.com/leptonai/gpud/components/accelerator/nvidia/ecc/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_query_metrics_ecc "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/ecc"
Expand All @@ -17,7 +18,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
Expand Down
38 changes: 0 additions & 38 deletions components/accelerator/nvidia/ecc/config.go

This file was deleted.

5 changes: 3 additions & 2 deletions components/accelerator/nvidia/error-xid-sxid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/dustin/go-humanize"
"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_error_xid_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error-xid-sxid/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_xid_sxid_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid-sxid-state"
Expand All @@ -18,7 +19,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

// this starts the Xid poller via "nvml.StartDefaultInstance"
Expand All @@ -44,7 +45,7 @@ func New(ctx context.Context, cfg Config) components.Component {
var _ components.Component = (*component)(nil)

type component struct {
cfg Config
cfg nvidia_common.Config
rootCtx context.Context
cancel context.CancelFunc
poller query.Poller
Expand Down
38 changes: 0 additions & 38 deletions components/accelerator/nvidia/error-xid-sxid/config.go

This file was deleted.

3 changes: 2 additions & 1 deletion components/accelerator/nvidia/error/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ import (
"time"

"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
)

const Name = "accelerator-nvidia-error"

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
Expand Down
38 changes: 0 additions & 38 deletions components/accelerator/nvidia/error/config.go

This file was deleted.

3 changes: 2 additions & 1 deletion components/accelerator/nvidia/error/xid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id"
nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml"
nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid"
Expand All @@ -17,7 +18,7 @@ import (
"github.com/leptonai/gpud/log"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()
setDefaultPoller(cfg)

Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/error/xid/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"time"

"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id"
nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml"
nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid"
Expand Down Expand Up @@ -223,7 +224,7 @@ var (
)

// only set once since it relies on the kube client and specific port
func setDefaultPoller(cfg Config) {
func setDefaultPoller(cfg nvidia_common.Config) {
defaultPollerOnce.Do(func() {
defaultPoller = query.New(
nvidia_component_error_xid_id.Name,
Expand Down
38 changes: 0 additions & 38 deletions components/accelerator/nvidia/error/xid/config.go

This file was deleted.

7 changes: 2 additions & 5 deletions components/accelerator/nvidia/fabric-manager/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"database/sql"
"encoding/json"

nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
fabric_manager_log "github.com/leptonai/gpud/components/accelerator/nvidia/query/fabric-manager-log"
query_config "github.com/leptonai/gpud/components/query/config"
query_log_common "github.com/leptonai/gpud/components/query/log/common"
Expand All @@ -15,11 +16,7 @@ import (
type Config struct {
Query query_config.Config `json:"query"`
Log query_log_config.Config `json:"log"`

NvidiaSMICommand string `json:"nvidia_smi_command"`
NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
IbstatCommand string `json:"ibstat_command"`
InfinibandClassDirectory string `json:"infiniband_class_directory"`
nvidia_common.ToolOverwrites
}

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/gpm/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_query_metrics_gpm "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/gpm"
nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml"
components_metrics_state "github.com/leptonai/gpud/components/metrics/state"
Expand All @@ -19,7 +20,7 @@ import (

const Name = "accelerator-nvidia-gpm"

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg nvidia_common.Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()
setDefaultPoller(cfg)

Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/gpm/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"sync"

"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml"
components_metrics "github.com/leptonai/gpud/components/metrics"
"github.com/leptonai/gpud/components/query"
Expand Down Expand Up @@ -96,7 +97,7 @@ var (
)

// only set once since it relies on the kube client and specific port
func setDefaultPoller(cfg Config) {
func setDefaultPoller(cfg nvidia_common.Config) {
defaultPollerOnce.Do(func() {
defaultPoller = query.New(
Name,
Expand Down
Loading

0 comments on commit a3a0e61

Please sign in to comment.