Skip to content

Commit

Permalink
feat(nvidia): move event type to common, define xid/sxid event level in the list (#297)
Browse files Browse the repository at this point in the history

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Jan 22, 2025
1 parent 1ca5660 commit a453982
Show file tree
Hide file tree
Showing 19 changed files with 801 additions and 33 deletions.
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/error-xid-sxid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
nvidia_error_xid_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error-xid-sxid/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_xid_sxid_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid-sxid-state"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"

Expand Down Expand Up @@ -92,7 +93,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E
convertedEvents = append(convertedEvents, components.Event{
Time: metav1.Time{Time: time.Unix(event.UnixSeconds, 0).UTC()},
Name: EventNameErroXid,
Type: components.EventTypeCritical,
Type: common.EventTypeCritical,
Message: msg,
ExtraInfo: map[string]string{
EventKeyUnixSeconds: strconv.FormatInt(event.UnixSeconds, 10),
Expand All @@ -114,7 +115,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E
convertedEvents = append(convertedEvents, components.Event{
Time: metav1.Time{Time: time.Unix(event.UnixSeconds, 0).UTC()},
Name: EventNameErroSXid,
Type: components.EventTypeCritical,
Type: common.EventTypeCritical,
Message: msg,
ExtraInfo: map[string]string{
EventKeyUnixSeconds: strconv.FormatInt(event.UnixSeconds, 10),
Expand Down
14 changes: 9 additions & 5 deletions components/accelerator/nvidia/error/sxid/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/leptonai/gpud/components"
nvidia_query_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/log"

"github.com/dustin/go-humanize"
Expand Down Expand Up @@ -88,18 +89,21 @@ func (o *Output) GetReason() Reason {

sxid := uint64(de.Detail.SXid)

reason.Errors = append(reason.Errors, SXidError{
sxidErr := SXidError{
Time: de.LogItem.Time,

DataSource: "dmesg",

DeviceUUID: de.DeviceUUID,

SXid: sxid,
}
if de.Detail != nil {
sxidErr.SuggestedActionsByGPUd = de.Detail.SuggestedActionsByGPUd
sxidErr.CriticalErrorMarkedByGPUd = de.Detail.CriticalErrorMarkedByGPUd
}

SuggestedActionsByGPUd: de.Detail.SuggestedActionsByGPUd,
CriticalErrorMarkedByGPUd: de.Detail.CriticalErrorMarkedByGPUd,
})
reason.Errors = append(reason.Errors, sxidErr)
}

sort.Slice(reason.Errors, func(i, j int) bool {
Expand Down Expand Up @@ -145,7 +149,7 @@ func (o *Output) getEvents(since time.Time) []components.Event {
des = append(des, components.Event{
Time: sxidErr.Time,
Name: EventNameErroSXid,
Type: components.EventTypeCritical,
Type: common.EventTypeCritical,
Message: msg,
ExtraInfo: map[string]string{
EventKeyErroSXidUnixSeconds: strconv.FormatInt(sxidErr.Time.Unix(), 10),
Expand Down
20 changes: 12 additions & 8 deletions components/accelerator/nvidia/error/xid/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func (o *Output) GetReason() Reason {
suggestedActions = o.NVMLXidEvent.Detail.SuggestedActionsByGPUd
}

reason.Errors = append(reason.Errors, XidError{
xidErr := XidError{
Time: o.NVMLXidEvent.Time,

DataSource: "nvml",
Expand All @@ -136,7 +136,9 @@ func (o *Output) GetReason() Reason {

SuggestedActionsByGPUd: suggestedActions,
CriticalErrorMarkedByGPUd: o.NVMLXidEvent.Detail != nil && o.NVMLXidEvent.Detail.CriticalErrorMarkedByGPUd,
})
}

reason.Errors = append(reason.Errors, xidErr)
}

for _, de := range o.DmesgErrors {
Expand All @@ -145,19 +147,21 @@ func (o *Output) GetReason() Reason {
}

xid := uint64(de.Detail.Xid)

reason.Errors = append(reason.Errors, XidError{
xidErr := XidError{
Time: de.LogItem.Time,

DataSource: "dmesg",

DeviceUUID: de.DeviceUUID,

Xid: xid,
}
if de.Detail != nil {
xidErr.SuggestedActionsByGPUd = de.Detail.SuggestedActionsByGPUd
xidErr.CriticalErrorMarkedByGPUd = de.Detail.CriticalErrorMarkedByGPUd
}

SuggestedActionsByGPUd: de.Detail.SuggestedActionsByGPUd,
CriticalErrorMarkedByGPUd: de.Detail.CriticalErrorMarkedByGPUd,
})
reason.Errors = append(reason.Errors, xidErr)
}

sort.Slice(reason.Errors, func(i, j int) bool {
Expand Down Expand Up @@ -203,7 +207,7 @@ func (o *Output) getEvents(since time.Time) []components.Event {
des = append(des, components.Event{
Time: xidErr.Time,
Name: EventNameErroXid,
Type: components.EventTypeCritical,
Type: common.EventTypeCritical,
Message: msg,
ExtraInfo: map[string]string{
EventKeyErroXidUnixSeconds: strconv.FormatInt(xidErr.Time.Unix(), 10),
Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/fabric-manager/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/leptonai/gpud/components"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
fabric_manager_log "github.com/leptonai/gpud/components/accelerator/nvidia/query/fabric-manager-log"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/components/query"
query_log "github.com/leptonai/gpud/components/query/log"
"github.com/leptonai/gpud/log"
Expand Down Expand Up @@ -142,7 +143,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E
evs = append(evs, components.Event{
Time: ev.Time,
Name: Name,
Type: components.EventTypeCritical,
Type: common.EventTypeCritical,
ExtraInfo: map[string]string{
EventKeyFabricManagerNVSwitchLogUnixSeconds: fmt.Sprintf("%d", ev.Time.Unix()),
EventKeyFabricManagerNVSwitchLogLine: ev.Line,
Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/hw-slowdown/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
nvidia_hw_slowdown_state "github.com/leptonai/gpud/components/accelerator/nvidia/hw-slowdown/state"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_query_metrics_clock "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/clock"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"

Expand Down Expand Up @@ -172,7 +173,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E
convertedEvents = append(convertedEvents, components.Event{
Time: metav1.Time{Time: time.Unix(event.Timestamp, 0).UTC()},
Name: EventNameHWSlowdown,
Type: components.EventTypeWarning,
Type: common.EventTypeWarning,
Message: strings.Join(event.Reasons, ", "),
ExtraInfo: map[string]string{
EventKeyGPUUUID: event.GPUUUID,
Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/nccl/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_nccl_id "github.com/leptonai/gpud/components/accelerator/nvidia/nccl/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/components/dmesg"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
Expand Down Expand Up @@ -107,7 +108,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E
events = append(events, components.Event{
Time: logItem.Time,
Name: EventNameNCCLSegfaultInLibncclFromDmesg,
Type: components.EventTypeCritical,
Type: common.EventTypeCritical,
ExtraInfo: map[string]string{
EventKeyNCCLSegfaultInLibncclFromDmesgUnixSeconds: strconv.FormatInt(logItem.Time.Unix(), 10),
EventKeyNCCLSegfaultInLibncclFromDmesgLogLine: logItem.Line,
Expand Down
3 changes: 2 additions & 1 deletion components/accelerator/nvidia/peermem/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/components/dmesg"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
Expand Down Expand Up @@ -164,7 +165,7 @@ func (c *component) getEvents(ctx context.Context, since time.Time, dmesgTailRes
events = append(events, components.Event{
Time: logItem.Time,
Name: EventNamePeermemInvalidContextFromDmesg,
Type: components.EventTypeCritical,
Type: common.EventTypeCritical,
ExtraInfo: map[string]string{
EventKeyPeermemInvalidContextFromDmesgUnixSeconds: strconv.FormatInt(logItem.Time.Unix(), 10),
EventKeyPeermemInvalidContextFromDmesgLogLine: logItem.Line,
Expand Down
Loading

0 comments on commit a453982

Please sign in to comment.