Skip to content

Commit

Permalink
feat(nvml): remove unused clock events channel, use "Get" instead
Browse files Browse the repository at this point in the history
Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Jan 18, 2025
1 parent a4d8cf0 commit f26b07b
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 81 deletions.
67 changes: 0 additions & 67 deletions components/accelerator/nvidia/query/nvml/clock_events.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
package nvml

import (
"context"
"encoding/json"
"fmt"
"sort"
"time"

clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state"
"github.com/leptonai/gpud/log"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
Expand Down Expand Up @@ -264,68 +262,3 @@ func (inst *instance) ClockEventsSupported() bool {

return inst.clockEventsSupported
}

// pollClockEvents wakes up once per minute and records hardware-slowdown
// clock events for every device in inst.devices. It returns when
// inst.rootCtx is done.
//
// For each device that reports at least one HW slowdown reason, the event is
// looked up in the read-only database; if it is not already present it is
// written through the read-write database and then offered to
// inst.clockEventsHWSlowdownCh without blocking. Failures are logged and
// never abort the polling loop.
func (inst *instance) pollClockEvents() {
	log.Logger.Debugw("polling clock events")

	tick := time.NewTicker(time.Minute)
	defer tick.Stop()

	for {
		// Block until the next tick, or bail out on shutdown.
		select {
		case <-inst.rootCtx.Done():
			return
		case <-tick.C:
		}

		// All devices polled in this round share a single timestamp,
		// truncated to the minute (nvidia-smi polling is periodic as well).
		nowMinute := time.Now().UTC().Truncate(time.Minute)

		for _, d := range inst.devices {
			ce, err := GetClockEvents(d.UUID, d.device)
			if err != nil {
				log.Logger.Errorw("failed to get clock events", "uuid", d.UUID, "error", err)
				continue
			}

			// Only HW slowdown events are tracked for now.
			if len(ce.HWSlowdownReasons) == 0 {
				continue
			}

			// Replace the reported timestamp with the minute-truncated one.
			ce.Time = metav1.Time{Time: nowMinute}

			event := clock_events_state.Event{
				UnixSeconds: ce.Time.Unix(),
				DataSource:  "nvml",
				EventType:   "hw_slowdown",
				GPUUUID:     d.UUID,
				Reasons:     ce.HWSlowdownReasons,
			}

			// Skip events that are already persisted.
			findCtx, findCancel := context.WithTimeout(inst.rootCtx, 10*time.Second)
			exists, err := clock_events_state.FindEvent(findCtx, inst.dbRO, event)
			findCancel()
			switch {
			case err != nil:
				log.Logger.Errorw("failed to find clock events", "uuid", d.UUID, "error", err)
				continue
			case exists:
				continue
			}

			// An insert failure is only logged; the event is still offered
			// to the channel below.
			insertCtx, insertCancel := context.WithTimeout(inst.rootCtx, 10*time.Second)
			insertErr := clock_events_state.InsertEvent(insertCtx, inst.dbRW, event)
			insertCancel()
			if insertErr != nil {
				log.Logger.Errorw("failed to insert clock events into database", "uuid", d.UUID, "error", insertErr)
			}

			// Non-blocking send: drop the event if the send cannot proceed
			// right away.
			select {
			case inst.clockEventsHWSlowdownCh <- &ce:
			default:
				log.Logger.Warnw("hw slowdown clock events channel is full, dropping clock events", "uuid", d.UUID)
			}
		}
	}
}
45 changes: 31 additions & 14 deletions components/accelerator/nvidia/query/nvml/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"sync"
"time"

nvidia_clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state"
nvidia_xid_sxid_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid-sxid-state"
"github.com/leptonai/gpud/log"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -68,8 +69,7 @@ type instance struct {
// read-only database instance
dbRO *sql.DB

clockEventsSupported bool
clockEventsHWSlowdownCh chan *ClockEvents
clockEventsSupported bool

xidErrorSupported bool
xidEventMask uint64
Expand Down Expand Up @@ -240,8 +240,7 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {

dbRW: op.dbRW,

clockEventsSupported: clockEventsSupported,
clockEventsHWSlowdownCh: make(chan *ClockEvents, 100),
clockEventsSupported: clockEventsSupported,

xidErrorSupported: false,
xidEventSet: xidEventSet,
Expand Down Expand Up @@ -369,10 +368,6 @@ func (inst *instance) Start() error {
}
}

if inst.clockEventsSupported {
go inst.pollClockEvents()
}

if inst.xidErrorSupported {
go inst.pollXidEvents()
} else {
Expand Down Expand Up @@ -484,13 +479,35 @@ func (inst *instance) Get() (*Output, error) {
if inst.clockEventsSupported {
clockEvents, err := GetClockEvents(devInfo.UUID, devInfo.device)
if err != nil {
joinedErrs = append(joinedErrs, fmt.Errorf("%w (GPU uuid %s)", err, devInfo.UUID))
joinedErrs = append(joinedErrs, fmt.Errorf("failed to get clock events: %w (GPU uuid %s)", err, devInfo.UUID))
} else {
// overwrite timestamp to the nearest minute
clockEvents.Time = metav1.Time{Time: truncNowUTC}

latestInfo.ClockEvents = &clockEvents

ev := nvidia_clock_events_state.Event{
UnixSeconds: clockEvents.Time.Unix(),
DataSource: "nvml",
EventType: "hw_slowdown",
GPUUUID: devInfo.UUID,
Reasons: clockEvents.HWSlowdownReasons,
}

cctx, ccancel := context.WithTimeout(inst.rootCtx, 10*time.Second)
found, err := nvidia_clock_events_state.FindEvent(cctx, inst.dbRO, ev)
ccancel()
if err != nil {
joinedErrs = append(joinedErrs, fmt.Errorf("failed to find clock events: %w (GPU uuid %s)", err, devInfo.UUID))
} else if !found {
cctx, ccancel = context.WithTimeout(inst.rootCtx, 10*time.Second)
err = nvidia_clock_events_state.InsertEvent(cctx, inst.dbRW, ev)
ccancel()
if err != nil {
joinedErrs = append(joinedErrs, fmt.Errorf("failed to insert clock events: %w (GPU uuid %s)", err, devInfo.UUID))
}
}
}

// overwrite timestamp to the nearest minute
clockEvents.Time = metav1.Time{Time: truncNowUTC}

latestInfo.ClockEvents = &clockEvents
}

latestInfo.ClockSpeed, err = GetClockSpeed(devInfo.UUID, devInfo.device)
Expand Down

0 comments on commit f26b07b

Please sign in to comment.