diff --git a/components/accelerator/nvidia/error-xid-sxid/component.go b/components/accelerator/nvidia/error-xid-sxid/component.go index 162b3628..e0147046 100644 --- a/components/accelerator/nvidia/error-xid-sxid/component.go +++ b/components/accelerator/nvidia/error-xid-sxid/component.go @@ -13,6 +13,7 @@ import ( nvidia_error_xid_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error-xid-sxid/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_xid_sxid_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid-sxid-state" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" @@ -92,7 +93,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E convertedEvents = append(convertedEvents, components.Event{ Time: metav1.Time{Time: time.Unix(event.UnixSeconds, 0).UTC()}, Name: EventNameErroXid, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, Message: msg, ExtraInfo: map[string]string{ EventKeyUnixSeconds: strconv.FormatInt(event.UnixSeconds, 10), @@ -114,7 +115,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E convertedEvents = append(convertedEvents, components.Event{ Time: metav1.Time{Time: time.Unix(event.UnixSeconds, 0).UTC()}, Name: EventNameErroSXid, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, Message: msg, ExtraInfo: map[string]string{ EventKeyUnixSeconds: strconv.FormatInt(event.UnixSeconds, 10), diff --git a/components/accelerator/nvidia/error/sxid/component_output.go b/components/accelerator/nvidia/error/sxid/component_output.go index 88dfa3d5..f02486ce 100644 --- a/components/accelerator/nvidia/error/sxid/component_output.go +++ b/components/accelerator/nvidia/error/sxid/component_output.go @@ -10,6 +10,7 @@ import ( "github.com/leptonai/gpud/components" nvidia_query_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/log" "github.com/dustin/go-humanize" @@ -88,7 +89,7 @@ func (o *Output) GetReason() Reason { sxid := uint64(de.Detail.SXid) - reason.Errors = append(reason.Errors, SXidError{ + sxidErr := SXidError{ Time: de.LogItem.Time, DataSource: "dmesg", @@ -96,10 +97,13 @@ func (o *Output) GetReason() Reason { DeviceUUID: de.DeviceUUID, SXid: sxid, + } + if de.Detail != nil { + sxidErr.SuggestedActionsByGPUd = de.Detail.SuggestedActionsByGPUd + sxidErr.CriticalErrorMarkedByGPUd = de.Detail.CriticalErrorMarkedByGPUd + } - SuggestedActionsByGPUd: de.Detail.SuggestedActionsByGPUd, - CriticalErrorMarkedByGPUd: de.Detail.CriticalErrorMarkedByGPUd, - }) + reason.Errors = append(reason.Errors, sxidErr) } sort.Slice(reason.Errors, func(i, j int) bool { @@ -145,7 +149,7 @@ func (o *Output) getEvents(since time.Time) []components.Event { des = append(des, components.Event{ Time: sxidErr.Time, Name: EventNameErroSXid, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, Message: msg, ExtraInfo: map[string]string{ EventKeyErroSXidUnixSeconds: strconv.FormatInt(sxidErr.Time.Unix(), 10), diff --git a/components/accelerator/nvidia/error/xid/component_output.go b/components/accelerator/nvidia/error/xid/component_output.go index 702d81f6..4e30d0eb 100644 --- a/components/accelerator/nvidia/error/xid/component_output.go +++ b/components/accelerator/nvidia/error/xid/component_output.go @@ -125,7 +125,7 @@ func (o *Output) GetReason() Reason { suggestedActions = o.NVMLXidEvent.Detail.SuggestedActionsByGPUd } - reason.Errors = append(reason.Errors, XidError{ + xidErr := XidError{ Time: o.NVMLXidEvent.Time, DataSource: "nvml", @@ -136,7 +136,9 @@ func (o *Output) GetReason() Reason { SuggestedActionsByGPUd: suggestedActions, CriticalErrorMarkedByGPUd: o.NVMLXidEvent.Detail != nil && o.NVMLXidEvent.Detail.CriticalErrorMarkedByGPUd, - }) + } + + reason.Errors = append(reason.Errors, xidErr) } for _, de := range o.DmesgErrors { @@ -145,8 +147,7 @@ func (o *Output) GetReason() Reason { } xid := uint64(de.Detail.Xid) - - reason.Errors = append(reason.Errors, XidError{ + xidErr := XidError{ Time: de.LogItem.Time, DataSource: "dmesg", @@ -154,10 +155,13 @@ func (o *Output) GetReason() Reason { DeviceUUID: de.DeviceUUID, Xid: xid, + } + if de.Detail != nil { + xidErr.SuggestedActionsByGPUd = de.Detail.SuggestedActionsByGPUd + xidErr.CriticalErrorMarkedByGPUd = de.Detail.CriticalErrorMarkedByGPUd + } - SuggestedActionsByGPUd: de.Detail.SuggestedActionsByGPUd, - CriticalErrorMarkedByGPUd: de.Detail.CriticalErrorMarkedByGPUd, - }) + reason.Errors = append(reason.Errors, xidErr) } sort.Slice(reason.Errors, func(i, j int) bool { @@ -203,7 +207,7 @@ func (o *Output) getEvents(since time.Time) []components.Event { des = append(des, components.Event{ Time: xidErr.Time, Name: EventNameErroXid, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, Message: msg, ExtraInfo: map[string]string{ EventKeyErroXidUnixSeconds: strconv.FormatInt(xidErr.Time.Unix(), 10), diff --git a/components/accelerator/nvidia/fabric-manager/component.go b/components/accelerator/nvidia/fabric-manager/component.go index 8c9dcbc8..d3925e58 100644 --- a/components/accelerator/nvidia/fabric-manager/component.go +++ b/components/accelerator/nvidia/fabric-manager/component.go @@ -10,6 +10,7 @@ import ( "github.com/leptonai/gpud/components" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" fabric_manager_log "github.com/leptonai/gpud/components/accelerator/nvidia/query/fabric-manager-log" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/components/query" query_log "github.com/leptonai/gpud/components/query/log" "github.com/leptonai/gpud/log" @@ -142,7 +143,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E evs = append(evs, components.Event{ Time: ev.Time, Name: Name, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, ExtraInfo: map[string]string{ EventKeyFabricManagerNVSwitchLogUnixSeconds: fmt.Sprintf("%d", ev.Time.Unix()), EventKeyFabricManagerNVSwitchLogLine: ev.Line, diff --git a/components/accelerator/nvidia/hw-slowdown/component.go b/components/accelerator/nvidia/hw-slowdown/component.go index 87199a8c..12cd1450 100644 --- a/components/accelerator/nvidia/hw-slowdown/component.go +++ b/components/accelerator/nvidia/hw-slowdown/component.go @@ -14,6 +14,7 @@ import ( nvidia_hw_slowdown_state "github.com/leptonai/gpud/components/accelerator/nvidia/hw-slowdown/state" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_metrics_clock "github.com/leptonai/gpud/components/accelerator/nvidia/query/metrics/clock" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" @@ -172,7 +173,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E convertedEvents = append(convertedEvents, components.Event{ Time: metav1.Time{Time: time.Unix(event.Timestamp, 0).UTC()}, Name: EventNameHWSlowdown, - Type: components.EventTypeWarning, + Type: common.EventTypeWarning, Message: strings.Join(event.Reasons, ", "), ExtraInfo: map[string]string{ EventKeyGPUUUID: event.GPUUUID, diff --git a/components/accelerator/nvidia/nccl/component.go b/components/accelerator/nvidia/nccl/component.go index 876925d7..f3c70d74 100644 --- a/components/accelerator/nvidia/nccl/component.go +++ b/components/accelerator/nvidia/nccl/component.go @@ -12,6 +12,7 @@ import ( nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_nccl_id "github.com/leptonai/gpud/components/accelerator/nvidia/nccl/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/components/dmesg" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" @@ -107,7 +108,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E events = append(events, components.Event{ Time: logItem.Time, Name: EventNameNCCLSegfaultInLibncclFromDmesg, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, ExtraInfo: map[string]string{ EventKeyNCCLSegfaultInLibncclFromDmesgUnixSeconds: strconv.FormatInt(logItem.Time.Unix(), 10), EventKeyNCCLSegfaultInLibncclFromDmesgLogLine: logItem.Line, diff --git a/components/accelerator/nvidia/peermem/component.go b/components/accelerator/nvidia/peermem/component.go index 202b945f..c6fdc50b 100644 --- a/components/accelerator/nvidia/peermem/component.go +++ b/components/accelerator/nvidia/peermem/component.go @@ -12,6 +12,7 @@ import ( nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common" nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/components/dmesg" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" @@ -164,7 +165,7 @@ func (c *component) getEvents(ctx context.Context, since time.Time, dmesgTailRes events = append(events, components.Event{ Time: logItem.Time, Name: EventNamePeermemInvalidContextFromDmesg, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, ExtraInfo: map[string]string{ EventKeyPeermemInvalidContextFromDmesgUnixSeconds: strconv.FormatInt(logItem.Time.Unix(), 10), EventKeyPeermemInvalidContextFromDmesgLogLine: logItem.Line, diff --git a/components/accelerator/nvidia/query/sxid/sxid.go b/components/accelerator/nvidia/query/sxid/sxid.go index 2949a1a8..c177b7a0 100644 --- a/components/accelerator/nvidia/query/sxid/sxid.go +++ b/components/accelerator/nvidia/query/sxid/sxid.go @@ -3,6 +3,7 @@ package sxid import ( "encoding/json" + "fmt" "github.com/leptonai/gpud/components/common" ) @@ -21,6 +22,8 @@ type Detail struct { // CriticalErrorMarkedByGPUd is true if the GPUd marks this SXid as a critical error. // You may use this field to decide whether to alert or not. CriticalErrorMarkedByGPUd bool `json:"critical_error_marked_by_gpud"` + // EventType is the type of the event. + EventType common.EventType `json:"event_type"` PotentialFatal bool `json:"potential_fatal"` AlwaysFatal bool `json:"always_fatal"` @@ -48,6 +51,9 @@ var defaultPotentialFatalErr = Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: true, AlwaysFatal: false, Impact: `If the error occurred on an NVSwitch access port, the impact will be limited to the corresponding guest VM. To recover, shut down the guest VM. @@ -71,6 +77,9 @@ var defaultAlwaysFatalErr = Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: true, AlwaysFatal: true, Impact: `Always fatal to the entire fabric/system.`, @@ -78,6 +87,15 @@ var defaultAlwaysFatalErr = Detail{ OtherImpact: "", } +// make sure we do not have unknown event type +func init() { + for id, detail := range details { + if detail.EventType == common.EventTypeUnknown || string(detail.EventType) == "" { + panic(fmt.Sprintf("unknown event type for SXid %d", id)) + } + } +} + // References: // "Non-Fatal NVSwitch SXid Errors" // "Fatal NVSwitch SXid Errors" @@ -118,6 +136,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: false, AlwaysFatal: false, Impact: "Corresponding GPU NVLink traffic will be stalled, and the subsequent GPU access will hang. The GPU driver on the guest VM will abort CUDA jobs with Xid 45.", @@ -135,6 +156,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -152,6 +176,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -169,6 +196,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -186,6 +216,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -203,6 +236,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -220,6 +256,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -237,6 +276,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -254,6 +296,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -271,6 +316,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -288,6 +336,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -305,6 +356,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -322,6 +376,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -339,6 +396,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -356,6 +416,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -373,6 +436,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -390,6 +456,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -407,6 +476,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -424,6 +496,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -441,6 +516,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "No guest VM impact because the NVSwitch hardware will auto correct the ECC errors.", @@ -455,9 +533,13 @@ var details = map[int]Detail{ Description: "", // NVLink packet needs to transmitted, may impact NVLink throughput + // no guest VM impact, NVSwitch hardware will auto correct the ECC errors SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "NVLink packet needs to be retransmitted. This error might impact the NVLink throughput of the specified port.", @@ -486,6 +568,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: false, AlwaysFatal: false, Impact: "Corresponding GPU NVLink traffic will be stalled, and subsequent GPU access will hang. The GPU driver on the guest VM will abort CUDA jobs with Xid 45.", @@ -514,6 +599,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: false, AlwaysFatal: false, Impact: "This error is usually accompanied by a fatal SXid error that will affect the corresponding GPU NVLink traffic.", @@ -528,9 +616,13 @@ var details = map[int]Detail{ Description: "", // this SXid can be safely ignored + // no guest VM impact, NVSwitch hardware will auto correct the ECC errors SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "This SXid can be safely ignored.", @@ -545,9 +637,13 @@ var details = map[int]Detail{ Description: "", // due to a broken/inconsistent connection or uncoordinated shutdown + // no guest VM impact, NVSwitch hardware will auto correct the ECC errors SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "This error could occur due to a broken/inconsistent connection or uncoordinated shutdown.", @@ -590,6 +686,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -620,6 +719,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -650,6 +752,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -680,6 +785,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -710,6 +818,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -740,6 +851,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -770,6 +884,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -800,6 +917,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -830,6 +950,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -860,6 +983,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -890,6 +1016,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -920,6 +1049,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -950,6 +1082,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -967,6 +1102,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -997,6 +1135,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1027,6 +1168,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1057,6 +1201,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1087,6 +1234,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1117,6 +1267,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1147,6 +1300,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1177,6 +1333,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1207,6 +1366,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1237,6 +1399,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1267,6 +1432,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1297,6 +1465,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1327,6 +1498,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1357,6 +1531,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1387,6 +1564,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1417,6 +1597,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1447,6 +1630,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1477,6 +1663,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1507,6 +1696,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1537,6 +1729,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1567,6 +1762,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1597,6 +1795,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1627,6 +1828,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1657,6 +1861,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact + ` @@ -1695,6 +1902,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: true, AlwaysFatal: false, Impact: "This error could occur due to a broken/inconsistent connection or uncoordinated shutdown.", @@ -1725,6 +1935,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1755,6 +1968,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1785,6 +2001,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1815,6 +2034,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultPotentialFatalErr.PotentialFatal, AlwaysFatal: defaultPotentialFatalErr.AlwaysFatal, Impact: defaultPotentialFatalErr.Impact, @@ -1857,6 +2079,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -1887,6 +2112,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -1917,6 +2145,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -1947,6 +2178,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -1977,6 +2211,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2007,6 +2244,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2037,6 +2277,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2067,6 +2310,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2097,6 +2343,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2127,6 +2376,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2157,6 +2409,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2187,6 +2442,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2217,6 +2475,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2247,6 +2508,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2277,6 +2541,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2307,6 +2574,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2337,6 +2607,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2367,6 +2640,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2397,6 +2673,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2427,6 +2706,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. }, CriticalErrorMarkedByGPUd: true, + // fatal; SXids whose GPUd.RepairActions has REBOOT_SYSTEM or HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + PotentialFatal: defaultAlwaysFatalErr.PotentialFatal, AlwaysFatal: defaultAlwaysFatalErr.AlwaysFatal, Impact: defaultAlwaysFatalErr.Impact, @@ -2456,6 +2738,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: true, AlwaysFatal: false, Impact: "", @@ -2473,6 +2758,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: true, AlwaysFatal: false, Impact: "", @@ -2490,6 +2778,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: true, AlwaysFatal: true, Impact: "If it occurs, it is fatal to the fabric/system.", @@ -2507,6 +2798,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: true, AlwaysFatal: false, Impact: "This error might force the specified NVSwitch Links to enter power saving mode (Single Lane Mode) and impact over the NVLink throughput.", @@ -2524,6 +2818,9 @@ Other Guest VM Impact: No impact if error is confined to a single GPU. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: true, AlwaysFatal: false, Impact: "", @@ -2555,9 +2852,13 @@ _RX_SHORT_ERROR_RATE in https://github.com/NVIDIA/open-gpu-kernel-modules/blob/d `, + // no guest VM impact, NVSwitch hardware will auto correct the ECC errors SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // warn; SXids whose SuggestedActionsByGPUd is none (CriticalErrorMarkedByGPUd=false) + EventType: common.EventTypeWarning, + PotentialFatal: false, AlwaysFatal: false, Impact: "", diff --git a/components/accelerator/nvidia/query/xid/xid.go b/components/accelerator/nvidia/query/xid/xid.go index 1e733b5a..ba0ea4df 100644 --- a/components/accelerator/nvidia/query/xid/xid.go +++ b/components/accelerator/nvidia/query/xid/xid.go @@ -3,6 +3,7 @@ package xid import ( "encoding/json" + "fmt" "github.com/leptonai/gpud/components/common" ) @@ -20,6 +21,8 @@ type Detail struct { // CriticalErrorMarkedByGPUd is true if the GPUd marks this Xid as a critical error. // You may use this field to decide whether to alert or not. CriticalErrorMarkedByGPUd bool `json:"critical_error_marked_by_gpud"` + // EventType is the type of the event. + EventType common.EventType `json:"event_type"` // PotentialHWError is true if the Xid indicates a potential hardware error. // Source: https://docs.nvidia.com/deploy/xid-errors/index.html#xid-error-listing @@ -105,6 +108,15 @@ func GetDetail(id int) (*Detail, bool) { return &e, ok } +// make sure we do not have unknown event type +func init() { + for id, detail := range details { + if detail.EventType == common.EventTypeUnknown || string(detail.EventType) == "" { + panic(fmt.Sprintf("unknown event type for Xid %d", id)) + } + } +} + // Copied from https://docs.nvidia.com/deploy/xid-details/index.html#xid-error-listing. // See https://docs.nvidia.com/deploy/gpu-debug-guidelines/index.html#xid-messages for more details. var details = map[int]Detail{ @@ -117,6 +129,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -136,6 +151,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -155,6 +173,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -174,6 +195,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -193,6 +217,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -212,6 +239,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -231,6 +261,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -250,6 +283,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -284,6 +320,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -303,6 +342,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -322,6 +364,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -356,6 +401,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -418,6 +466,10 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is IGNORE_NO_ACTION_REQUIRED without REBOOT_SYSTEM/HARDWARE_INSPECTION + // Xids whose GPUd.RepairActions is CHECK_USER_APP_AND_GPU without REBOOT_SYSTEM/HARDWARE_INSPECTION + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -441,6 +493,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -460,6 +515,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -494,6 +552,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -513,6 +574,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -547,6 +611,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -581,6 +648,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -600,6 +670,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -619,6 +692,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -638,6 +714,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -657,6 +736,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -676,6 +758,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -695,6 +780,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -729,6 +817,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -763,6 +854,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -797,6 +891,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -831,6 +928,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -865,6 +965,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -932,6 +1035,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is CHECK_USER_APP_AND_GPU but HARDWARE_INSPECTION when the issue persists + EventType: common.EventTypeCritical, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -1003,6 +1109,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1037,6 +1146,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1071,6 +1183,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1105,6 +1220,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1139,6 +1257,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1158,6 +1279,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1223,6 +1347,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1242,6 +1369,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1261,6 +1391,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1280,6 +1413,9 @@ var details = map[int]Detail{ SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1314,6 +1450,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1378,6 +1517,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is CHECK_USER_APP_AND_GPU but HARDWARE_INSPECTION when the issue persists + EventType: common.EventTypeCritical, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1447,6 +1589,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1516,6 +1661,10 @@ var details = map[int]Detail{ // unhealthy if there's no previous Xid event in the same time window CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is IGNORE_NO_ACTION_REQUIRED without REBOOT_SYSTEM/HARDWARE_INSPECTION + // Xids whose GPUd.RepairActions is CHECK_USER_APP_AND_GPU without REBOOT_SYSTEM/HARDWARE_INSPECTION + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1554,6 +1703,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1588,6 +1740,9 @@ var details = map[int]Detail{ }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1666,6 +1821,9 @@ The error is also reported to your application. In most cases, you need to reset }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -1685,6 +1843,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1704,6 +1865,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1723,6 +1887,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1742,6 +1909,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1761,6 +1931,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1780,6 +1953,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1799,6 +1975,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1818,6 +1997,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -1837,6 +2019,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -1856,6 +2041,9 @@ The error is also reported to your application. In most cases, you need to reset SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -1890,6 +2078,9 @@ The error is also reported to your application. In most cases, you need to reset }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1924,6 +2115,9 @@ The error is also reported to your application. In most cases, you need to reset }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -1991,6 +2185,9 @@ Internal micro-controller breakpoint/warning. The GPU internal engine stops work }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -2057,6 +2254,9 @@ Internal micro-controller breakpoint/warning. The GPU internal engine stops work }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2146,6 +2346,9 @@ Xid 63 indicates that the retirement or remapping information is successfully re }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2235,6 +2438,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2271,6 +2477,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM but no immediate reboot is required + EventType: common.EventTypeCritical, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2290,6 +2499,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -2309,6 +2521,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -2377,6 +2592,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record // ignore first xid 68 occurrences CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2447,6 +2665,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2466,6 +2687,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2485,6 +2709,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2504,6 +2731,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2523,6 +2753,9 @@ Xid 64 indicates that the retirement or remapping information fails to be record SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2620,6 +2853,9 @@ The XID indicates an NVLink hardware error. The GPU encounters a critical hardwa }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2639,6 +2875,9 @@ The XID indicates an NVLink hardware error. The GPU encounters a critical hardwa SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2658,6 +2897,9 @@ The XID indicates an NVLink hardware error. The GPU encounters a critical hardwa SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2677,6 +2919,9 @@ The XID indicates an NVLink hardware error. The GPU encounters a critical hardwa SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2711,6 +2956,9 @@ The XID indicates an NVLink hardware error. The GPU encounters a critical hardwa }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -2785,6 +3033,9 @@ This event may also be cause by failing GPU hardware or other driver issues. }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2804,6 +3055,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2838,6 +3092,9 @@ This event may also be cause by failing GPU hardware or other driver issues. }, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2857,6 +3114,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2876,6 +3136,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2895,6 +3158,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2914,6 +3180,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2933,6 +3202,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2952,6 +3224,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -2971,6 +3246,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -2990,6 +3268,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3009,6 +3290,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -3028,6 +3312,9 @@ This event may also be cause by failing GPU hardware or other driver issues. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -3098,6 +3385,10 @@ See below for guidelines on when to RMA GPUs based on excessive errors. }, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is IGNORE_NO_ACTION_REQUIRED without REBOOT_SYSTEM/HARDWARE_INSPECTION + // Xids whose GPUd.RepairActions is CHECK_USER_APP_AND_GPU without REBOOT_SYSTEM/HARDWARE_INSPECTION + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3117,6 +3408,9 @@ See below for guidelines on when to RMA GPUs based on excessive errors. SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -3208,6 +3502,9 @@ This event is generated if the error suppression mechanism successfully suppress // ignore the first few occurrences and then suggest reboot CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM but no immediate reboot is required + EventType: common.EventTypeCritical, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3295,6 +3592,10 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + // Xids whose GPUd.RepairActions is HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3314,6 +3615,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3333,6 +3637,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3352,6 +3659,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3371,6 +3681,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3390,6 +3703,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3409,6 +3725,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3428,6 +3747,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3447,6 +3769,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3466,6 +3791,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3485,6 +3813,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3504,6 +3835,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -3523,6 +3857,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -3542,6 +3879,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -3561,6 +3901,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3595,6 +3938,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup }, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is HARDWARE_INSPECTION + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3614,6 +3960,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3633,6 +3982,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3652,6 +4004,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3671,6 +4026,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3690,6 +4048,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3709,6 +4070,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3728,6 +4092,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3747,6 +4114,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3814,6 +4184,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3881,6 +4254,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3948,6 +4324,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -3967,6 +4346,9 @@ This event is similar to Xid 94. However, Xid 94 indicates that the error is sup SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -4034,6 +4416,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi }, CriticalErrorMarkedByGPUd: true, // only because it requires reboot + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -4053,6 +4438,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -4072,6 +4460,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -4091,6 +4482,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4110,6 +4504,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4129,6 +4526,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4148,6 +4548,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4167,6 +4570,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4186,6 +4592,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4205,6 +4614,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4224,6 +4636,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4243,6 +4658,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4262,6 +4680,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4326,6 +4747,10 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi }, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is IGNORE_NO_ACTION_REQUIRED without REBOOT_SYSTEM/HARDWARE_INSPECTION + // Xids whose GPUd.RepairActions is CHECK_USER_APP_AND_GPU without REBOOT_SYSTEM/HARDWARE_INSPECTION + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4345,6 +4770,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4364,6 +4792,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4431,6 +4862,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi }, CriticalErrorMarkedByGPUd: true, + // Xids whose GPUd.RepairActions is REBOOT_SYSTEM + EventType: common.EventTypeFatal, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, @@ -4450,6 +4884,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4469,6 +4906,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: false, @@ -4488,6 +4928,9 @@ Report a GPU issue and reset GPU(s) reporting the XID (refer to GPU reset capabi SuggestedActionsByGPUd: nil, CriticalErrorMarkedByGPUd: false, + // Xids whose GPUd.RepairActions is empty + EventType: common.EventTypeWarning, + // below are defined in https://docs.nvidia.com/deploy/xid-errors/index.html // only indicates potential causes thus we do not solely rely on them PotentialHWError: true, diff --git a/components/event_types.go b/components/common/event_types.go similarity index 63% rename from components/event_types.go rename to components/common/event_types.go index 4ae16301..46b89855 100644 --- a/components/event_types.go +++ b/components/common/event_types.go @@ -1,22 +1,26 @@ -package components +package common type EventType string const ( EventTypeUnknown EventType = "Unknown" - // EventTypeInfo represents a general event that requires no action + // EventTypeInfo represents a general event that requires no action. + // Info - Informative, no further action needed. EventTypeInfo EventType = "Info" - // EventTypeWarning represents an event that may impact workloads + // EventTypeWarning represents an event that may impact workloads. + // Warning - Some issue happened but no further action needed, expecting automatic recovery. EventTypeWarning EventType = "Warning" // EventTypeCritical represents an event that is definitely impacting workloads - // and requires immediate attention + // and requires immediate attention. + // Critical - Some critical issue happened thus action required, not a hardware issue. EventTypeCritical EventType = "Critical" // EventTypeFatal represents a fatal event that impacts wide systems - // and requires immediate attention and action + // and requires immediate attention and action. + // Fatal - Fatal/hardware issue occurred thus immediate action required, may require reboot/hardware repair. EventTypeFatal EventType = "Fatal" ) diff --git a/components/event_types_test.go b/components/common/event_types_test.go similarity index 98% rename from components/event_types_test.go rename to components/common/event_types_test.go index c3abcafc..3b6e2344 100644 --- a/components/event_types_test.go +++ b/components/common/event_types_test.go @@ -1,4 +1,4 @@ -package components +package common import "testing" diff --git a/components/components.go b/components/components.go index 83fb9103..68d6c179 100644 --- a/components/components.go +++ b/components/components.go @@ -79,7 +79,7 @@ type State struct { type Event struct { Time metav1.Time `json:"time"` Name string `json:"name,omitempty"` - Type EventType `json:"type,omitempty"` + Type common.EventType `json:"type,omitempty"` Message string `json:"message,omitempty"` // detailed message of the event ExtraInfo map[string]string `json:"extra_info,omitempty"` // any extra information the component may want to expose SuggestedActions *common.SuggestedActions `json:"suggested_actions,omitempty"` diff --git a/components/cpu/component.go b/components/cpu/component.go index bcda7f33..7e01f241 100644 --- a/components/cpu/component.go +++ b/components/cpu/component.go @@ -9,6 +9,7 @@ import ( "time" "github.com/leptonai/gpud/components" + "github.com/leptonai/gpud/components/common" cpu_id "github.com/leptonai/gpud/components/cpu/id" "github.com/leptonai/gpud/components/cpu/metrics" "github.com/leptonai/gpud/components/dmesg" @@ -138,7 +139,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E events = append(events, components.Event{ Time: ev.Time, Name: name, - Type: components.EventTypeWarning, + Type: common.EventTypeWarning, ExtraInfo: map[string]string{ EventKeyUnixSeconds: strconv.FormatInt(ev.Time.Unix(), 10), EventKeyLogLine: item.Line, diff --git a/components/dmesg/component_event.go b/components/dmesg/component_event.go index 2f2ac541..fc22b22a 100644 --- a/components/dmesg/component_event.go +++ b/components/dmesg/component_event.go @@ -6,6 +6,7 @@ import ( "strings" "github.com/leptonai/gpud/components" + "github.com/leptonai/gpud/components/common" query_log "github.com/leptonai/gpud/components/query/log" "github.com/leptonai/gpud/log" ) @@ -62,7 +63,7 @@ func (ev *Event) Events() []components.Event { Name: EventNameDmesgMatched, // criticality should be decided in individual components - Type: components.EventTypeWarning, + Type: common.EventTypeWarning, Message: msg, ExtraInfo: map[string]string{ diff --git a/components/fd/component.go b/components/fd/component.go index 2ee574cd..6235e478 100644 --- a/components/fd/component.go +++ b/components/fd/component.go @@ -9,6 +9,7 @@ import ( "time" "github.com/leptonai/gpud/components" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/components/dmesg" fd_id "github.com/leptonai/gpud/components/fd/id" "github.com/leptonai/gpud/components/fd/metrics" @@ -130,7 +131,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E events = append(events, components.Event{ Time: logItem.Time, Name: EventNameErrorVFSFileMaxLimitReached, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, Message: "VFS file-max limit reached", ExtraInfo: map[string]string{ EventKeyErrorVFSFileMaxLimitReachedUnixSeconds: strconv.FormatInt(logItem.Time.Unix(), 10), diff --git a/components/fuse/component.go b/components/fuse/component.go index f8346d8d..c94d6c29 100644 --- a/components/fuse/component.go +++ b/components/fuse/component.go @@ -10,6 +10,7 @@ import ( "time" "github.com/leptonai/gpud/components" + "github.com/leptonai/gpud/components/common" fuse_id "github.com/leptonai/gpud/components/fuse/id" "github.com/leptonai/gpud/components/fuse/metrics" "github.com/leptonai/gpud/components/fuse/state" @@ -125,7 +126,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E convertedEvents = append(convertedEvents, components.Event{ Time: metav1.Time{Time: time.Unix(event.UnixSeconds, 0).UTC()}, Name: EventNameFuseConnections, - Type: components.EventTypeCritical, + Type: common.EventTypeCritical, Message: strings.Join(msgs, ", "), ExtraInfo: map[string]string{ EventKeyUnixSeconds: strconv.FormatInt(event.UnixSeconds, 10), diff --git a/components/memory/component.go b/components/memory/component.go index 8ff4af24..85e56577 100644 --- a/components/memory/component.go +++ b/components/memory/component.go @@ -9,6 +9,7 @@ import ( "time" "github.com/leptonai/gpud/components" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/components/dmesg" memory_id "github.com/leptonai/gpud/components/memory/id" "github.com/leptonai/gpud/components/memory/metrics" @@ -138,7 +139,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E events = append(events, components.Event{ Time: ev.Time, Name: name, - Type: components.EventTypeWarning, + Type: common.EventTypeWarning, ExtraInfo: map[string]string{ EventKeyUnixSeconds: strconv.FormatInt(ev.Time.Unix(), 10), EventKeyLogLine: item.Line, diff --git a/components/os/component.go b/components/os/component.go index a8c27fa2..ef644c3e 100644 --- a/components/os/component.go +++ b/components/os/component.go @@ -7,6 +7,7 @@ import ( "time" "github.com/leptonai/gpud/components" + "github.com/leptonai/gpud/components/common" os_id "github.com/leptonai/gpud/components/os/id" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/components/state" @@ -102,7 +103,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E evs = append(evs, components.Event{ Time: metav1.Time{Time: rebootedAt}, Name: "reboot", - Type: components.EventTypeWarning, + Type: common.EventTypeWarning, Message: fmt.Sprintf("system reboot detected (%s)", rebootedAtHumanized), }) } diff --git a/components/pci/component.go b/components/pci/component.go index 647ffcd0..9e708271 100644 --- a/components/pci/component.go +++ b/components/pci/component.go @@ -7,6 +7,7 @@ import ( "time" "github.com/leptonai/gpud/components" + "github.com/leptonai/gpud/components/common" "github.com/leptonai/gpud/components/pci/id" "github.com/leptonai/gpud/components/pci/state" "github.com/leptonai/gpud/components/query" @@ -65,7 +66,7 @@ func (c *component) Events(ctx context.Context, since time.Time) ([]components.E events = append(events, components.Event{ Name: EventNameACSEnabled, Time: metav1.Time{Time: time.Unix(ev.UnixSeconds, 0)}, - Type: components.EventTypeWarning, + Type: common.EventTypeWarning, Message: strings.Join(ev.Reasons, ", "), }) }