diff --git a/cmd/gpud/command/command.go b/cmd/gpud/command/command.go index 3024bc8a..067a5c70 100644 --- a/cmd/gpud/command/command.go +++ b/cmd/gpud/command/command.go @@ -473,27 +473,6 @@ sudo rm /etc/systemd/system/gpud.service }, }, }, - { - Name: "logs", - Aliases: []string{"log", "l"}, - - Usage: "checks the gpud logs", - Action: cmdLogs, - Flags: []cli.Flag{ - &cli.IntFlag{ - Name: "lines,n", - Usage: "set the number to tail logs", - Destination: &tailLines, - Value: 100, - }, - &cli.StringFlag{ - Name: "log-file", - Usage: "set the log file path (set empty to stdout/stderr)", - Destination: &logFile, - Value: "", - }, - }, - }, { Name: "is-nvidia", diff --git a/cmd/gpud/command/logs.go b/cmd/gpud/command/logs.go deleted file mode 100644 index 384f9756..00000000 --- a/cmd/gpud/command/logs.go +++ /dev/null @@ -1,44 +0,0 @@ -package command - -import ( - "context" - "fmt" - "os" - "time" - - "github.com/leptonai/gpud/pkg/query/log/tail" - - "github.com/urfave/cli" -) - -func cmdLogs(cliContext *cli.Context) error { - if _, err := os.Stat(logFile); err != nil { - return fmt.Errorf("log file %s does not exist", logFile) - } - - rootCtx, rootCancel := context.WithTimeout(context.Background(), 2*time.Minute) - defer rootCancel() - - fmt.Printf("%s tailing %d lines\n\n", inProgress, tailLines) - - lines := make([]string, 0, tailLines) - _, err := tail.Scan( - rootCtx, - tail.WithDedup(true), - tail.WithFile(logFile), - tail.WithLinesToTail(tailLines), - tail.WithPerLineFunc(func(line []byte) { - lines = append(lines, string(line)) - }), - ) - if err != nil { - return fmt.Errorf("failed to tail log file: %w", err) - } - - // print in reverse order (last line is the latest) - for i := len(lines) - 1; i >= 0; i-- { - fmt.Println(lines[i]) - } - - return nil -} diff --git a/cmd/gpud/command/up.go b/cmd/gpud/command/up.go index 0cd353d1..b33c3d94 100644 --- a/cmd/gpud/command/up.go +++ b/cmd/gpud/command/up.go @@ -58,7 +58,7 @@ func cmdUp(cliContext *cli.Context) (retErr error) { return err } - fmt.Printf("%s successfully started gpud (run 'gpud status' or 'gpud logs' for checking status)\n", checkMark) + fmt.Printf("%s successfully started gpud (run 'gpud status' for checking status)\n", checkMark) return nil } diff --git a/go.mod b/go.mod index e5c7dd4a..24d4c943 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,6 @@ require ( github.com/hdevalence/ed25519consensus v0.2.0 github.com/mattn/go-sqlite3 v1.14.25-0.20241209043634-7658c06970ec github.com/mitchellh/go-homedir v1.1.0 - github.com/nxadm/tail v1.4.11 github.com/olekukonko/tablewriter v0.0.5 github.com/onsi/ginkgo/v2 v2.21.0 github.com/onsi/gomega v1.35.1 @@ -43,7 +42,6 @@ require ( k8s.io/apimachinery v0.32.0 k8s.io/client-go v0.29.1 k8s.io/cri-api v0.32.0 - k8s.io/utils v0.0.0-20241210054802-24370beab758 sigs.k8s.io/yaml v1.4.0 tailscale.com v1.68.2 ) @@ -164,10 +162,10 @@ require ( google.golang.org/genproto/googleapis/rpc v0.0.0-20250106144421-5f5ef82da422 // indirect google.golang.org/protobuf v1.36.1 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect gotest.tools/v3 v3.5.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect nhooyr.io/websocket v1.8.10 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect diff --git a/go.sum b/go.sum index d017d3fc..288057e5 100644 --- a/go.sum +++ b/go.sum @@ -98,7 
+98,6 @@ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= -github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= @@ -241,8 +240,6 @@ github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7P github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY= -github.com/nxadm/tail v1.4.11/go.mod h1:OTaG3NK980DZzxbRq6lEuzgU+mug70nY11sMd4JXXHc= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= @@ -407,7 +404,6 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220817070843-5a390386f1f2/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.4.1-0.20230131160137-e7d7f63158de/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -458,8 +454,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= diff --git a/pkg/diagnose/diagnose.go b/pkg/diagnose/diagnose.go index 64a7bed3..87998c93 100644 --- a/pkg/diagnose/diagnose.go +++ b/pkg/diagnose/diagnose.go @@ -11,15 +11,11 @@ import ( "sigs.k8s.io/yaml" - nvidia_component_error_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid" - nvidia_component_error_xid 
"github.com/leptonai/gpud/components/accelerator/nvidia/error/xid" pkg_dmesg "github.com/leptonai/gpud/pkg/dmesg" "github.com/leptonai/gpud/pkg/host" "github.com/leptonai/gpud/pkg/log" nvidia_query "github.com/leptonai/gpud/pkg/nvidia-query" "github.com/leptonai/gpud/pkg/process" - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - query_log_tail "github.com/leptonai/gpud/pkg/query/log/tail" pkd_systemd "github.com/leptonai/gpud/pkg/systemd" ) @@ -144,44 +140,16 @@ func run(ctx context.Context, dir string, opts ...OpOption) error { nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) if nvidiaInstalled && err == nil { fmt.Printf("%s scanning dmesg with regexes\n", inProgress) - matched, err := query_log_tail.Scan( - ctx, - query_log_tail.WithDedup(true), - query_log_tail.WithCommands(pkg_dmesg.DefaultDmesgScanCommands), - query_log_tail.WithLinesToTail(5000), - query_log_tail.WithMatchFunc( - func(line string) (string, string) { - xidErr := nvidia_component_error_xid.Match(line) - if xidErr != nil { - return "xid found", "" - } - return "", "" // no match - }, - func(line string) (string, string) { - sxidErr := nvidia_component_error_sxid.Match(line) - if sxidErr != nil { - return "sxid found", "" - } - return "", "" // no match - }, - ), - query_log_tail.WithExtractTime(func(l []byte) (time.Time, []byte, error) { - dm := pkg_dmesg.ParseDmesgLine(string(l)) - return dm.Timestamp, l, nil - }), - query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) { - o.CheckSummary = append(o.CheckSummary, fmt.Sprintf("dmesg match: %s", string(line))) - }), - ) + issueCnt, err := scanDmesg(ctx) if err != nil { o.Results = append(o.Results, CommandResult{ Command: strings.Join(pkg_dmesg.DefaultDmesgScanCommands[0], " "), Error: err.Error(), }) - } else if matched == 0 { + } else if issueCnt == 0 { o.CheckSummary = append(o.CheckSummary, "dmesg scan passed") } else { - o.CheckSummary = append(o.CheckSummary, fmt.Sprintf("dmesg scan detected %d issues", matched)) + o.CheckSummary = append(o.CheckSummary, fmt.Sprintf("dmesg scan detected %d issues", issueCnt)) } } diff --git a/pkg/diagnose/scan.go b/pkg/diagnose/scan.go index 58d91e91..7700d855 100644 --- a/pkg/diagnose/scan.go +++ b/pkg/diagnose/scan.go @@ -8,8 +8,8 @@ import ( "runtime" "time" - nvidia_component_error_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid" - nvidia_component_error_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid" + nvidia_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid" + nvidia_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid" nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id" nvidia_hw_slowdown_id "github.com/leptonai/gpud/components/accelerator/nvidia/hw-slowdown/id" "github.com/leptonai/gpud/pkg/disk" @@ -24,8 +24,6 @@ import ( "github.com/leptonai/gpud/pkg/nvidia-query/infiniband" nvidia_query_nvml "github.com/leptonai/gpud/pkg/nvidia-query/nvml" "github.com/leptonai/gpud/pkg/process" - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - query_log_tail "github.com/leptonai/gpud/pkg/query/log/tail" "github.com/leptonai/gpud/pkg/sqlite" "github.com/dustin/go-humanize" @@ -227,52 +225,14 @@ func Scan(ctx context.Context, opts ...OpOption) error { } fmt.Printf("%s scanning dmesg for %d lines\n", inProgress, op.lines) - matched, err := query_log_tail.Scan( - ctx, - query_log_tail.WithDedup(true), - 
query_log_tail.WithCommands(pkg_dmesg.DefaultDmesgScanCommands), - query_log_tail.WithLinesToTail(op.lines), - query_log_tail.WithMatchFunc( - func(line string) (string, string) { - xidErr := nvidia_component_error_xid.Match(line) - if xidErr != nil { - return "xid found", "" - } - return "", "" // no match - }, - func(line string) (string, string) { - sxidErr := nvidia_component_error_sxid.Match(line) - if sxidErr != nil { - return "sxid found", "" - } - return "", "" // no match - }, - ), - query_log_tail.WithExtractTime(func(l []byte) (time.Time, []byte, error) { - dm := pkg_dmesg.ParseDmesgLine(string(l)) - return dm.Timestamp, l, nil - }), - query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) { - if xidErr := nvidia_component_error_xid.Match(string(line)); xidErr != nil { - log.Logger.Warnw("known xid", "line", string(line)) - yb, _ := xidErr.YAML() - fmt.Println(string(yb)) - } - - if sxidErr := nvidia_component_error_sxid.Match(string(line)); sxidErr != nil { - log.Logger.Warnw("known sxid", "line", string(line)) - yb, _ := sxidErr.YAML() - fmt.Println(string(yb)) - } - }), - ) + issueCnt, err := scanDmesg(ctx) if err != nil { return err } - if matched == 0 { + if issueCnt == 0 { fmt.Printf("%s scanned dmesg file -- found no issue\n", checkMark) } else { - fmt.Printf("%s scanned dmesg file -- found %d issue(s)\n", warningSign, matched) + fmt.Printf("%s scanned dmesg file -- found %d issue(s)\n", warningSign, issueCnt) } } @@ -324,3 +284,59 @@ func Scan(ctx context.Context, opts ...OpOption) error { fmt.Printf("\n\n%s scan complete\n\n", checkMark) return nil } + +func scanDmesg(ctx context.Context) (int, error) { + p, err := process.New( + process.WithCommands(pkg_dmesg.DefaultDmesgScanCommands), + process.WithRunAsBashScript(), + ) + if err != nil { + return 0, err + } + if err := p.Start(ctx); err != nil { + return 0, err + } + defer func() { + if err := p.Close(ctx); err != nil { + log.Logger.Warnw("failed to abort command", "err", err) + } + }() + + nowUTC := time.Now().UTC() + issueCnt := 0 + if err := process.Read( + ctx, + p, + process.WithReadStdout(), + process.WithReadStderr(), + process.WithProcessLine(func(line string) { + parsed := pkg_dmesg.ParseDmesgLine(line) + ts := humanize.RelTime(parsed.Timestamp, nowUTC, "ago", "from now") + + if found := nvidia_xid.Match(line); found != nil { + fmt.Printf("[XID found] (%s) %q\n", ts, parsed.Content) + issueCnt++ + return + } + + if found := nvidia_sxid.Match(line); found != nil { + fmt.Printf("[SXID found] (%s) %q\n", ts, parsed.Content) + issueCnt++ + return + } + }), + ); err != nil { + return 0, err + } + + select { + case <-ctx.Done(): + return 0, ctx.Err() + case err := <-p.Wait(): + if err != nil { + return 0, err + } + } + + return issueCnt, nil +} diff --git a/pkg/nvidia-query/sxid/dmesg.go b/pkg/nvidia-query/sxid/dmesg.go deleted file mode 100644 index b9fb3ea3..00000000 --- a/pkg/nvidia-query/sxid/dmesg.go +++ /dev/null @@ -1,101 +0,0 @@ -package sxid - -import ( - "encoding/json" - "regexp" - "strconv" - - query_log "github.com/leptonai/gpud/pkg/query/log" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/yaml" -) - -const ( - // e.g., - // [111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First) - // [131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up - // - // ref. 
- // "D.4 Non-Fatal NVSwitch SXid Errors" - // https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf - RegexNVSwitchSXidDmesg = `SXid.*?: (\d+),` - - // Regex to extract PCI device ID from NVSwitch SXid messages - RegexNVSwitchSXidDeviceUUID = `SXid \((PCI:[0-9a-fA-F:\.]+)\)` -) - -var ( - CompiledRegexNVSwitchSXidDmesg = regexp.MustCompile(RegexNVSwitchSXidDmesg) - CompiledRegexNVSwitchSXidDeviceUUID = regexp.MustCompile(RegexNVSwitchSXidDeviceUUID) -) - -// Extracts the nvidia NVSwitch SXid error code from the dmesg log line. -// Returns 0 if the error code is not found. -// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf -func ExtractNVSwitchSXid(line string) int { - if match := CompiledRegexNVSwitchSXidDmesg.FindStringSubmatch(line); match != nil { - if id, err := strconv.Atoi(match[1]); err == nil { - return id - } - } - return 0 -} - -// ExtractNVSwitchSXidDeviceUUID extracts the PCI device ID from the dmesg log line. -// Returns empty string if the device ID is not found. -func ExtractNVSwitchSXidDeviceUUID(line string) string { - if match := CompiledRegexNVSwitchSXidDeviceUUID.FindStringSubmatch(line); match != nil { - return match[1] - } - return "" -} - -type DmesgError struct { - DeviceUUID string `json:"device_uuid"` - Detail *Detail `json:"detail"` - LogItem query_log.Item `json:"log_item"` -} - -func (de *DmesgError) JSON() ([]byte, error) { - return json.Marshal(de) -} - -func (de *DmesgError) YAML() ([]byte, error) { - return yaml.Marshal(de) -} - -func ParseDmesgErrorJSON(data []byte) (*DmesgError, error) { - de := new(DmesgError) - if err := json.Unmarshal(data, de); err != nil { - return nil, err - } - return de, nil -} - -func ParseDmesgErrorYAML(data []byte) (*DmesgError, error) { - de := new(DmesgError) - if err := yaml.Unmarshal(data, de); err != nil { - return nil, err - } - return de, nil -} - -func ParseDmesgLogLine(time metav1.Time, line string) (DmesgError, error) { - de := DmesgError{ - DeviceUUID: ExtractNVSwitchSXidDeviceUUID(line), - LogItem: query_log.Item{ - Line: line, - Matched: nil, - Time: time, - }, - } - - errCode := ExtractNVSwitchSXid(line) - errDetail, ok := GetDetail(errCode) - if ok { - de.Detail = errDetail - } - - return de, nil -} diff --git a/pkg/nvidia-query/sxid/dmesg_test.go b/pkg/nvidia-query/sxid/dmesg_test.go deleted file mode 100644 index 7ed5756e..00000000 --- a/pkg/nvidia-query/sxid/dmesg_test.go +++ /dev/null @@ -1,113 +0,0 @@ -package sxid - -import ( - "testing" -) - -func TestExtractNVSwitchSXid(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - input string - expected int - }{ - { - name: "SXid match", - input: "Some log content SXid error: 31, other info", - expected: 31, - }, - { - name: "No match", - input: "Regular log content without Xid errors", - expected: 0, - }, - { - name: "SXid with non-numeric value", - input: "SXid error: abc, invalid data", - expected: 0, - }, - { - name: "error example", - input: "[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)", - expected: 12028, - }, - { - // ref. https://access.redhat.com/solutions/6619941 - name: "error example", - input: "[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 20034, Fatal, Link 30 LTSSM Fault Up", - expected: 20034, - }, - { - // ref. 
https://access.redhat.com/solutions/6619941 - name: "error example", - input: "[131453.740754] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 20034, Severity 1 Engine instance 30 Sub-engine instance 00", - expected: 20034, - }, - { - // ref. https://access.redhat.com/solutions/6619941 - name: "error example", - input: "[131453.740758] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 20034, Data {0x50610002, 0x10100030, 0x00000000, 0x10100030, 0x00000000, 0x00000000, 0x00000000, 0x00000000}p", - expected: 20034, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := ExtractNVSwitchSXid(tt.input) - if result != tt.expected { - t.Errorf("ExtractNVSwitchSXid(%q) = %d, want %d", tt.input, result, tt.expected) - } - }) - } -} - -func TestExtractNVSwitchSXidDeviceID(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - input string - expected string - }{ - { - name: "valid device ID", - input: "[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)", - expected: "PCI:0000:05:00.0", - }, - { - name: "another valid device ID", - input: "[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 20034, Fatal, Link 30 LTSSM Fault Up", - expected: "PCI:0000:a9:00.0", - }, - { - name: "valid device ID without timestamp", - input: "nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)", - expected: "PCI:0000:05:00.0", - }, - { - name: "another valid device ID without timestamp", - input: "nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 20034, Fatal, Link 30 LTSSM Fault Up", - expected: "PCI:0000:a9:00.0", - }, - { - name: "no device ID", - input: "Regular log content without SXid", - expected: "", - }, - { - name: "malformed device ID", - input: "SXid (PCI:invalid): some error", - expected: "", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := ExtractNVSwitchSXidDeviceUUID(tt.input) - if result != tt.expected { - t.Errorf("ExtractNVSwitchSXidDeviceID(%q) = %q, want %q", tt.input, result, tt.expected) - } - }) - } -} diff --git a/pkg/nvidia-query/xid/dmesg.go b/pkg/nvidia-query/xid/dmesg.go deleted file mode 100644 index ce0b654c..00000000 --- a/pkg/nvidia-query/xid/dmesg.go +++ /dev/null @@ -1,104 +0,0 @@ -package xid - -import ( - "encoding/json" - "regexp" - "strconv" - - query_log "github.com/leptonai/gpud/pkg/query/log" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/yaml" -) - -const ( - // e.g., - // [...] NVRM: Xid (0000:03:00): 14, Channel 00000001 - // [...] NVRM: Xid (PCI:0000:05:00): 79, pid='', name=, GPU has fallen off the bus. - // NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus. - // - // ref. - // https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf - RegexNVRMXidDmesg = `NVRM: Xid.*?: (\d+),` - - // Regex to extract PCI device ID from NVRM Xid messages - // Matches both formats: (0000:03:00) and (PCI:0000:05:00) - RegexNVRMXidDeviceUUID = `NVRM: Xid \(((?:PCI:)?[0-9a-fA-F:]+)\)` -) - -var ( - CompiledRegexNVRMXidDmesg = regexp.MustCompile(RegexNVRMXidDmesg) - CompiledRegexNVRMXidDeviceUUID = regexp.MustCompile(RegexNVRMXidDeviceUUID) -) - -// Extracts the nvidia Xid error code from the dmesg log line. -// Returns 0 if the error code is not found. 
-// https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf -func ExtractNVRMXid(line string) int { - if match := CompiledRegexNVRMXidDmesg.FindStringSubmatch(line); match != nil { - if id, err := strconv.Atoi(match[1]); err == nil { - return id - } - } - return 0 -} - -// ExtractNVRMXidDeviceUUID extracts the PCI device ID from the NVRM Xid dmesg log line. -// For input without "PCI:" prefix, it returns the ID as is. -// For input with "PCI:" prefix, it returns the full ID including the prefix. -// Returns empty string if the device ID is not found. -func ExtractNVRMXidDeviceUUID(line string) string { - if match := CompiledRegexNVRMXidDeviceUUID.FindStringSubmatch(line); match != nil { - return match[1] - } - return "" -} - -type DmesgError struct { - DeviceUUID string `json:"device_uuid"` - Detail *Detail `json:"detail"` - LogItem query_log.Item `json:"log_item"` -} - -func (de *DmesgError) JSON() ([]byte, error) { - return json.Marshal(de) -} - -func (de *DmesgError) YAML() ([]byte, error) { - return yaml.Marshal(de) -} - -func ParseDmesgErrorJSON(data []byte) (*DmesgError, error) { - de := new(DmesgError) - if err := json.Unmarshal(data, de); err != nil { - return nil, err - } - return de, nil -} - -func ParseDmesgErrorYAML(data []byte) (*DmesgError, error) { - de := new(DmesgError) - if err := yaml.Unmarshal(data, de); err != nil { - return nil, err - } - return de, nil -} - -func ParseDmesgLogLine(time metav1.Time, line string) (DmesgError, error) { - de := DmesgError{ - DeviceUUID: ExtractNVRMXidDeviceUUID(line), - LogItem: query_log.Item{ - Line: line, - Matched: nil, - Time: time, - }, - } - - errCode := ExtractNVRMXid(line) - errDetail, ok := GetDetail(errCode) - if ok { - de.Detail = errDetail - } - - return de, nil -} diff --git a/pkg/nvidia-query/xid/dmesg_test.go b/pkg/nvidia-query/xid/dmesg_test.go deleted file mode 100644 index 7f135138..00000000 --- a/pkg/nvidia-query/xid/dmesg_test.go +++ /dev/null @@ -1,150 +0,0 @@ -package xid - -import "testing" - -func TestExtractNVRMXid(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - input string - expected int - }{ - { - name: "NVRM Xid match", - input: "NVRM: Xid critical error: 79, details follow", - expected: 79, - }, - { - name: "No match", - input: "Regular log content without Xid errors", - expected: 0, - }, - { - name: "NVRM Xid with non-numeric value", - input: "NVRM: Xid error: xyz, invalid data", - expected: 0, - }, - { - name: "error example", - input: "[111111111.111] NVRM: Xid (PCI:0000:05:00): 79, pid='', name=, GPU has fallen off the bus.", - expected: 79, - }, - { - name: "error example", - input: "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus.", - expected: 79, - }, - { - name: "error example", - input: "[...] NVRM: Xid (0000:03:00): 14, Channel 00000001", - expected: 14, - }, - - // ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#id3 - { - name: "Contained error with MIG enabled", - input: "NVRM: Xid (PCI:0000:01:00 GPU-I:05): 94, pid=7194, Contained: CE User Channel (0x9). RST: No, D-RST: No", - expected: 94, - }, - { - name: "Contained error with MIG disabled", - input: "NVRM: Xid (PCI:0000:01:00): 94, pid=7062, Contained: CE User Channel (0x9). RST: No, D-RST: No", - expected: 94, - }, - { - name: "Uncontained error", - input: "NVRM: Xid (PCI:0000:01:00): 95, pid=7062, Uncontained: LTC TAG (0x2,0x0). 
RST: Yes, D-RST: No", - expected: 95, - }, - { - name: "GSP RPC timeout error", - input: "NVRM: Xid (PCI:0000:19:00): 119, pid=452531, name=cache_mgr_main, Timeout after 6s of waiting for RPC response from GPU0 GSP! Expected function 76 (GSP_RM_CONTROL) (0x20809004 0x608).", - expected: 119, - }, - { - name: "GSP RPC timeout error with different process", - input: "NVRM: Xid (PCI:0000:19:00): 119, pid=3116150, name=cache_mgr_main, Timeout after 6s of waiting for RPC response from GPU0 GSP! Expected function 76 (GSP_RM_CONTROL) (0x20800a4c 0x4).", - expected: 119, - }, - { - name: "GSP RPC timeout error with nvidia-smi", - input: "NVRM: Xid (PCI:0000:19:00): 119, pid=2485486, name=nvidia-smi, Timeout after 6s of waiting for RPC response from GPU0 GSP! Expected function 76 (GSP_RM_CONTROL) (0x20803039 0x20).", - expected: 119, - }, - { - name: "GSP RPC timeout error after 45s", - input: "NVRM: Xid (PCI:0000:19:00): 119, pid=289957, name=pt_main_thread, Timeout after 45s of waiting for RPC response from GPU0 GSP! Expected function 76 (GSP_RM_CONTROL) (0xa06f0112 0x1).", - expected: 119, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := ExtractNVRMXid(tt.input) - if result != tt.expected { - t.Errorf("ExtractNVRMXid(%q) = %d, want %d", tt.input, result, tt.expected) - } - }) - } -} - -func TestExtractNVRMXidDeviceID(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - input string - expected string - }{ - { - name: "device ID without PCI prefix", - input: "[...] NVRM: Xid (0000:03:00): 14, Channel 00000001", - expected: "0000:03:00", - }, - { - name: "device ID with PCI prefix", - input: "[...] NVRM: Xid (PCI:0000:05:00): 79, pid='', name=, GPU has fallen off the bus.", - expected: "PCI:0000:05:00", - }, - { - name: "device ID without PCI prefix without timestamp", - input: "NVRM: Xid (0000:03:00): 14, Channel 00000001", - expected: "0000:03:00", - }, - { - name: "device ID with PCI prefix without timestamp #1", - input: "NVRM: Xid (PCI:0000:05:00): 79, pid='', name=, GPU has fallen off the bus.", - expected: "PCI:0000:05:00", - }, - { - name: "device ID with PCI prefix without timestamp #2", - input: "NVRM: Xid (PCI:0000:40:00): 119, pid=2272442, name=python, Timeout after 45s of waiting for RPC response from GPU3 GSP! Expected function 76 (GSP_RM_CONTROL) (0x20801702 0x4).", - expected: "PCI:0000:40:00", - }, - { - name: "another device ID with PCI prefix", - input: "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus.", - expected: "PCI:0000:01:00", - }, - { - name: "no device ID", - input: "Regular log content without Xid", - expected: "", - }, - { - name: "malformed device ID", - input: "NVRM: Xid (invalid): some error", - expected: "", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := ExtractNVRMXidDeviceUUID(tt.input) - if result != tt.expected { - t.Errorf("ExtractNVRMXidDeviceID(%q) = %q, want %q", tt.input, result, tt.expected) - } - }) - } -} diff --git a/pkg/query/log/common/common.go b/pkg/query/log/common/common.go deleted file mode 100644 index 6abd55cb..00000000 --- a/pkg/query/log/common/common.go +++ /dev/null @@ -1,10 +0,0 @@ -// Package common provides the common log components. 
-package common - -import ( - "time" -) - -type ExtractTimeFunc func([]byte) (time.Time, []byte, error) - -type ProcessMatchedFunc func(parsedTime time.Time, line []byte, filter *Filter) diff --git a/pkg/query/log/common/filter.go b/pkg/query/log/common/filter.go deleted file mode 100644 index 5e2df9a0..00000000 --- a/pkg/query/log/common/filter.go +++ /dev/null @@ -1,101 +0,0 @@ -package common - -import ( - "bytes" - "encoding/json" - "regexp" - "strings" - - "sigs.k8s.io/yaml" -) - -type Filter struct { - Name string `json:"name"` - - Substring *string `json:"substring,omitempty"` - - Regex *string `json:"regex,omitempty"` - regex *regexp.Regexp `json:"-"` - - // OwnerReferences is a list of component names that watches on this filter. - // Useful when multiple components watch on the same log file. - // e.g., if the component X and Y both watch on the same log file, - // with the same filter rule (substring/regex), this field will be - // set to [x, y]. - OwnerReferences []string `json:"owner_references,omitempty"` -} - -func (f *Filter) JSON() ([]byte, error) { - return json.Marshal(f) -} - -func ParseFilterJSON(data []byte) (*Filter, error) { - f := new(Filter) - if err := json.Unmarshal(data, f); err != nil { - return nil, err - } - return f, nil -} - -func (f *Filter) YAML() ([]byte, error) { - return yaml.Marshal(f) -} - -func ParseFilterYAML(data []byte) (*Filter, error) { - f := new(Filter) - err := yaml.Unmarshal(data, f) - if err != nil { - return nil, err - } - return f, nil -} - -// Compiles the regex, if set. -func (f *Filter) Compile() error { - if f.Regex != nil { - rgx, err := regexp.Compile(*f.Regex) - if err != nil { - return err - } - f.regex = rgx - } - return nil -} - -func (f *Filter) MatchString(line string) (bool, error) { - if f.Regex != nil && f.regex == nil { - if err := f.Compile(); err != nil { - return false, err - } - } - return f.matchString(line), nil -} - -func (f *Filter) MatchBytes(line []byte) (bool, error) { - if f.Regex != nil && f.regex == nil { - if err := f.Compile(); err != nil { - return false, err - } - } - return f.matchBytes(line), nil -} - -func (f *Filter) matchString(line string) bool { - if f.Substring != nil && strings.Contains(line, *f.Substring) { - return true - } - if f.regex != nil && f.regex.MatchString(line) { - return true - } - return false -} - -func (f *Filter) matchBytes(line []byte) bool { - if f.Substring != nil && bytes.Contains(line, []byte(*f.Substring)) { - return true - } - if f.regex != nil && f.regex.Match(line) { - return true - } - return false -} diff --git a/pkg/query/log/common/match.go b/pkg/query/log/common/match.go deleted file mode 100644 index 6e493a02..00000000 --- a/pkg/query/log/common/match.go +++ /dev/null @@ -1,5 +0,0 @@ -package common - -// MatchFunc is a function that matches a line of log and returns a Match. -// It returns empty strings if the line does not match. -type MatchFunc func(line string) (eventName string, message string) diff --git a/pkg/query/log/config/config.go b/pkg/query/log/config/config.go deleted file mode 100644 index 1d7be34b..00000000 --- a/pkg/query/log/config/config.go +++ /dev/null @@ -1,96 +0,0 @@ -// Package config provides the log poller configuration. 
-package config - -import ( - "context" - "encoding/json" - "errors" - - query_config "github.com/leptonai/gpud/pkg/query/config" - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - - "github.com/nxadm/tail" -) - -const DefaultBufferSize = 2000 - -type Config struct { - Query query_config.Config `json:"query"` - - BufferSize int `json:"buffer_size"` - - File string `json:"file"` - Commands [][]string `json:"commands"` - - // For each interval, execute the scanning operation - // based on the following config (rather than polling). - // This is to backtrack the old log messages. - Scan *Scan `json:"scan,omitempty"` - - // "OR" conditions to select logs. - // An event is generated if any of the filters match. - // Useful for explicit blacklisting "error" logs - // (e.g., GPU error messages in dmesg). - SelectFilters []*query_log_common.Filter `json:"select_filters"` - // "AND" conditions to select logs. - // An event is generated if all of the filters do not match. - // Useful for explicit whitelisting logs and catch all other - // (e.g., good healthy log messages). - RejectFilters []*query_log_common.Filter `json:"reject_filters"` - - SeekInfo *tail.SeekInfo `json:"seek_info,omitempty"` - - // Used to commit the last seek info to disk. - SeekInfoSyncer func(ctx context.Context, file string, seekInfo tail.SeekInfo) `json:"-"` - - // Parse time format - TimeParseFunc query_log_common.ExtractTimeFunc `json:"-"` -} - -// For each interval, execute the scanning operation -// based on the following config (rather than polling). -// This is to backtrack the old log messages. -type Scan struct { - File string `json:"file"` - Commands [][]string `json:"commands"` - LinesToTail int `json:"lines_to_tail"` -} - -func (cfg *Config) Validate() error { - if cfg.File == "" && len(cfg.Commands) == 0 { - return errors.New("file or commands must be set") - } - if cfg.Scan != nil { - if cfg.Scan.File == "" && len(cfg.Scan.Commands) == 0 { - return errors.New("file or commands must be set for scan") - } - } - if len(cfg.SelectFilters) > 0 && len(cfg.RejectFilters) > 0 { - return errors.New("cannot have both select and reject filters") - } - return nil -} - -func (cfg *Config) SetDefaultsIfNotSet() { - cfg.Query.SetDefaultsIfNotSet() - - if cfg.BufferSize == 0 { - cfg.BufferSize = DefaultBufferSize - } - if cfg.Query.QueueSize < cfg.BufferSize { - cfg.Query.QueueSize = cfg.BufferSize - } -} - -func ParseConfig(b any) (*Config, error) { - raw, err := json.Marshal(b) - if err != nil { - return nil, err - } - cfg := new(Config) - err = json.Unmarshal(raw, cfg) - if err != nil { - return nil, err - } - return cfg, nil -} diff --git a/pkg/query/log/doc.go b/pkg/query/log/doc.go deleted file mode 100644 index a866f472..00000000 --- a/pkg/query/log/doc.go +++ /dev/null @@ -1,2 +0,0 @@ -// Package log provides the log file/output poller implementation. 
-package log diff --git a/pkg/query/log/poller.go b/pkg/query/log/poller.go deleted file mode 100644 index a7bd2937..00000000 --- a/pkg/query/log/poller.go +++ /dev/null @@ -1,302 +0,0 @@ -package log - -import ( - "context" - "encoding/json" - "fmt" - "strings" - "sync" - "time" - - "github.com/leptonai/gpud/pkg/log" - "github.com/leptonai/gpud/pkg/query" - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - query_log_config "github.com/leptonai/gpud/pkg/query/log/config" - query_log_tail "github.com/leptonai/gpud/pkg/query/log/tail" - - "github.com/nxadm/tail" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -var _ query.Poller = (*poller)(nil) - -var _ Poller = (*poller)(nil) - -// Poller implements the log file poller. -// The underlying poller is a tail.Tail but with poll mode enabled. -// Poll is better when there are multiple consumers (e.g., multiple log tailers) -// reading from the same file. -type Poller interface { - query.Poller - - // Config returns the config used to start the log poller. - // This is useful for debugging and logging. - LogConfig() query_log_config.Config - - // Returns the file name that this poller watches on. - File() string - // Returns the commands that this poller is running. - Commands() [][]string - - // Returns all the events for the given "since" time. - // If none, it returns all events that are already filtered - // by the default filters in the configuration. - // Returns `github.com/leptonai/gpud/pkg/query.ErrNoData` if there is no event found. - Find(since time.Time, selectFilters ...*query_log_common.Filter) ([]Item, error) - - // Returns the last seek info. - SeekInfo() tail.SeekInfo -} - -// Item is the basic unit of data that poller returns. -// If enabled, each result is persisted in the storage. -// It is converted from the underlying query poller. -type Item struct { - Time metav1.Time `json:"time"` - Line string `json:"line"` - - // Matched filter that was applied to this item/line. 
- Matched *query_log_common.Filter `json:"matched,omitempty"` - - Error *string `json:"error,omitempty"` -} - -func (item Item) JSON() ([]byte, error) { - return json.Marshal(item) -} - -func ParseItemJSON(data []byte) (Item, error) { - item := Item{} - if err := json.Unmarshal(data, &item); err != nil { - return Item{}, err - } - if item.Matched != nil && item.Matched.Regex != nil { - if err := item.Matched.Compile(); err != nil { - return Item{}, err - } - } - return item, nil -} - -type Items []Item - -type poller struct { - query.Poller - - cfg query_log_config.Config - - tailLogger query_log_tail.Streamer - - tailFileSeekInfoMu sync.RWMutex - tailFileSeekInfo tail.SeekInfo - tailFileSeekInfoSyncer func(ctx context.Context, file string, seekInfo tail.SeekInfo) - - bufferedItemsMu sync.RWMutex - bufferedItems []Item -} - -func New(ctx context.Context, cfg query_log_config.Config, extractTime query_log_common.ExtractTimeFunc, processMatched query_log_common.ProcessMatchedFunc) (Poller, error) { - return newPoller(ctx, cfg, extractTime, processMatched) -} - -func newPoller(ctx context.Context, cfg query_log_config.Config, extractTime query_log_common.ExtractTimeFunc, processMatched query_log_common.ProcessMatchedFunc) (*poller, error) { - if err := cfg.Validate(); err != nil { - return nil, err - } - cfg.SetDefaultsIfNotSet() - - options := []query_log_tail.OpOption{ - query_log_tail.WithDedup(true), - query_log_tail.WithSelectFilter(cfg.SelectFilters...), - query_log_tail.WithRejectFilter(cfg.RejectFilters...), - query_log_tail.WithExtractTime(extractTime), - query_log_tail.WithProcessMatched(processMatched), - query_log_tail.WithSkipEmptyLine(true), - } - - if cfg.File != "" { - options = append(options, query_log_tail.WithLabel("file", cfg.File)) - } else { - for i, cmds := range cfg.Commands { - options = append(options, query_log_tail.WithLabel(fmt.Sprintf("command-%d", i+1), strings.Join(cmds, " "))) - } - } - - var tailLogger query_log_tail.Streamer - var err error - if cfg.File != "" { - tailLogger, err = query_log_tail.NewFromFile(ctx, cfg.File, cfg.SeekInfo, options...) - } else { - tailLogger, err = query_log_tail.NewFromCommand(ctx, cfg.Commands, options...) - } - if err != nil { - return nil, err - } - - pl := &poller{ - cfg: cfg, - tailLogger: tailLogger, - tailFileSeekInfoSyncer: cfg.SeekInfoSyncer, - bufferedItems: make([]Item, 0, cfg.BufferSize), - } - go pl.pollSync(ctx) - - flushFunc := func(ctx context.Context) (any, error) { - pl.bufferedItemsMu.Lock() - defer pl.bufferedItemsMu.Unlock() - copied := make([]Item, len(pl.bufferedItems)) - copy(copied, pl.bufferedItems) - pl.bufferedItems = pl.bufferedItems[:0] - return copied, nil - } - - name := cfg.File - if name == "" { - for _, args := range cfg.Commands { - if name != "" { - name += ", " - } - name += strings.Join(args, " ") - } - } - - pl.Poller = query.New( - name, - cfg.Query, - flushFunc, - nil, - ) - - return pl, nil -} - -// pollSync polls the log tail from the specified file or long-running commands -// and syncs the items to the buffered items. -// This only catches the realtime/latest and all the future logs. 
-func (pl *poller) pollSync(ctx context.Context) { - for line := range pl.tailLogger.Line() { - var errStr *string - if line.Err != nil { - s := line.Err.Error() - errStr = &s - } - - item := Item{ - Time: metav1.Time{Time: line.Time}, - Line: line.Text, - Matched: line.MatchedFilter, - Error: errStr, - } - - pl.bufferedItemsMu.Lock() - pl.bufferedItems = append(pl.bufferedItems, item) - pl.bufferedItemsMu.Unlock() - - pl.tailFileSeekInfoMu.Lock() - pl.tailFileSeekInfo = line.SeekInfo - if pl.tailFileSeekInfoSyncer != nil { - pl.tailFileSeekInfoSyncer(ctx, pl.tailLogger.File(), pl.tailFileSeekInfo) - } - pl.tailFileSeekInfoMu.Unlock() - } -} - -func (pl *poller) LogConfig() query_log_config.Config { - return pl.cfg -} - -func (pl *poller) File() string { - return pl.tailLogger.File() -} - -func (pl *poller) Commands() [][]string { - return pl.tailLogger.Commands() -} - -// This only catches the realtime/latest and all the future logs. -// Returns `github.com/leptonai/gpud/pkg/query.ErrNoData` if there is no event found. -func (pl *poller) Find(since time.Time, selectFilters ...*query_log_common.Filter) ([]Item, error) { - // 1. filter the already flushed/in-queue ones - polledItems, err := pl.Poller.All(since) - if err != nil { - return nil, err - } - - items := make([]Item, 0) - for _, item := range polledItems { - if item.Error != nil { - continue - } - if item.Output == nil { - log.Logger.Warnw("polled item has no output (without an error)", "item", item) - continue - } - - itemsFromPollerOutput := item.Output.([]Item) - for _, item := range itemsFromPollerOutput { - if len(selectFilters) == 0 { - items = append(items, item) - continue - } - - var matchedFilter *query_log_common.Filter - for _, f := range selectFilters { - matched, err := f.MatchString(item.Line) - if err != nil { - return nil, err - } - if matched { - matchedFilter = f - break - } - } - - if matchedFilter != nil { - item.Matched = matchedFilter - items = append(items, item) - } - } - } - - pl.bufferedItemsMu.RLock() - defer pl.bufferedItemsMu.RUnlock() - - // 2. 
filter the buffered ones - // if not empty, buffered ones have not been flushed by the poller - // thus not returned by the poller all events - for _, item := range pl.bufferedItems { - if !since.IsZero() && item.Time.Time.Before(since) { - continue - } - - if len(selectFilters) == 0 { - items = append(items, item) - continue - } - - var matchedFilter *query_log_common.Filter - for _, f := range selectFilters { - matched, err := f.MatchString(item.Line) - if err != nil { - return nil, err - } - if matched { - matchedFilter = f - break - } - } - - if matchedFilter != nil { - item.Matched = matchedFilter - items = append(items, item) - } - } - - return items, nil -} - -func (pl *poller) SeekInfo() tail.SeekInfo { - pl.tailFileSeekInfoMu.RLock() - defer pl.tailFileSeekInfoMu.RUnlock() - return pl.tailFileSeekInfo -} diff --git a/pkg/query/log/poller_test.go b/pkg/query/log/poller_test.go deleted file mode 100644 index 6d11ee30..00000000 --- a/pkg/query/log/poller_test.go +++ /dev/null @@ -1,280 +0,0 @@ -package log - -import ( - "context" - "os" - "testing" - "time" - - "github.com/leptonai/gpud/pkg/query" - query_config "github.com/leptonai/gpud/pkg/query/config" - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - query_log_config "github.com/leptonai/gpud/pkg/query/log/config" - - "github.com/nxadm/tail" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/utils/ptr" -) - -func TestPoller(t *testing.T) { - t.Parallel() - - cfg := query_log_config.Config{ - File: "tail/testdata/kubelet.0.log", - } - - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - - poller, err := newPoller(ctx, cfg, nil, nil) - if err != nil { - t.Fatalf("failed to create log poller: %v", err) - } - defer poller.Stop("test") - - if _, err := poller.Find(time.Now().Add(time.Hour)); err != query.ErrNoData { - t.Fatalf("expected no data, got %v", err) - } - - synced := 0 - - poller.tailFileSeekInfoMu.Lock() - poller.tailFileSeekInfoSyncer = func(_ context.Context, file string, seekInfo tail.SeekInfo) { - synced++ - t.Logf("seek info: %+v", seekInfo) - } - poller.tailFileSeekInfoMu.Unlock() - - poller.Start(ctx, query_config.Config{Interval: metav1.Duration{Duration: time.Second}}, "test") - - time.Sleep(5 * time.Second) - - allItems, err := poller.Find(time.Now().Add(-time.Hour)) - if err != nil { - t.Fatalf("failed to get all items: %v", err) - } - for _, r := range allItems { - t.Log(r.Line) - } - - t.Logf("seek info %+v", poller.SeekInfo()) - - if synced != 20 { // 20 lines - t.Fatalf("expected 20 seek info sync, got %d", synced) - } -} - -func TestPollerTail(t *testing.T) { - t.Parallel() - - f, err := os.CreateTemp(os.TempDir(), "test-log") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(f.Name()) - - cfg := query_log_config.Config{ - File: f.Name(), - } - - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - - poller, err := newPoller(ctx, cfg, nil, nil) - if err != nil { - t.Fatalf("failed to create log poller: %v", err) - } - defer poller.Stop("test") - - synced := 0 - poller.tailFileSeekInfoMu.Lock() - poller.tailFileSeekInfoSyncer = func(_ context.Context, file string, seekInfo tail.SeekInfo) { - synced++ - t.Logf("seek info: %+v", seekInfo) - } - poller.tailFileSeekInfoMu.Unlock() - - poller.Start(ctx, query_config.Config{Interval: metav1.Duration{Duration: time.Second}}, "test") - - t.Log("writing 1") - if _, err := f.WriteString("hello1\n"); err != nil { - t.Fatalf("failed 
to write to temp file: %v", err) - } - if ferr := f.Sync(); ferr != nil { - t.Fatalf("failed to sync temp file: %v", ferr) - } - - t.Log("writing 2") - if _, err := f.WriteString("hello2\n"); err != nil { - t.Fatalf("failed to write to temp file: %v", err) - } - if ferr := f.Sync(); ferr != nil { - t.Fatalf("failed to sync temp file: %v", ferr) - } - - time.Sleep(10 * time.Second) - - allItems, err := poller.Find(time.Now().Add(-time.Hour)) - if err != nil { - t.Fatalf("failed to get all items: %v", err) - } - for _, r := range allItems { - t.Log(r.Line) - } - - t.Logf("seek info %+v", poller.SeekInfo()) - - if synced != 2 { // 2 lines - t.Fatalf("expected 2 seek info sync, got %d", synced) - } -} - -func TestItemJSON(t *testing.T) { - t.Parallel() - - testCases := []struct { - name string - item Item - wantErr bool - wantJSON string // Add expected JSON string for verification - validate func(t *testing.T, got Item) - }{ - { - name: "basic item", - item: Item{ - Time: metav1.Time{Time: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)}, - Line: "test log line", - }, - wantJSON: `{"time":"2024-01-01T00:00:00Z","line":"test log line"}`, - validate: func(t *testing.T, got Item) { - if got.Line != "test log line" { - t.Errorf("expected line %q, got %q", "test log line", got.Line) - } - if !got.Time.Equal(&metav1.Time{Time: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)}) { - t.Errorf("expected time %v, got %v", time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), got.Time) - } - }, - }, - { - name: "item with error", - item: Item{ - Time: metav1.Time{Time: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)}, - Line: "test log line", - Error: ptr.To("test error"), - }, - wantJSON: `{"time":"2024-01-01T00:00:00Z","line":"test log line","error":"test error"}`, - validate: func(t *testing.T, got Item) { - if got.Error == nil || *got.Error != "test error" { - t.Errorf("expected error %q, got %v", "test error", got.Error) - } - }, - }, - { - name: "item with matched filter", - item: Item{ - Time: metav1.Time{Time: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)}, - Line: "test log line", - Matched: &query_log_common.Filter{ - Name: "test filter", - Regex: ptr.To("test.*"), - }, - }, - wantJSON: `{"time":"2024-01-01T00:00:00Z","line":"test log line","matched":{"name":"test filter","regex":"test.*"}}`, - validate: func(t *testing.T, got Item) { - if got.Matched == nil { - t.Fatal("expected matched filter, got nil") - } - if got.Matched.Name != "test filter" { - t.Errorf("expected filter name %q, got %q", "test filter", got.Matched.Name) - } - if got.Matched.Regex == nil || *got.Matched.Regex != "test.*" { - t.Errorf("expected filter regex %q, got %v", "test.*", got.Matched.Regex) - } - }, - }, - { - name: "item with nil error", - item: Item{ - Time: metav1.Time{Time: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)}, - Line: "test log line", - Error: nil, - }, - wantJSON: `{"time":"2024-01-01T00:00:00Z","line":"test log line"}`, - validate: func(t *testing.T, got Item) { - if got.Error != nil { - t.Errorf("expected nil error, got %v", got.Error) - } - }, - }, - } - - for _, tc := range testCases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - - // Test marshaling - data, err := tc.item.JSON() - if (err != nil) != tc.wantErr { - t.Fatalf("JSON() error = %v, wantErr %v", err, tc.wantErr) - } - if tc.wantErr { - return - } - - // Verify JSON string matches expected - if tc.wantJSON != "" { - if got := string(data); got != tc.wantJSON { - t.Errorf("JSON() = %v, want %v", got, tc.wantJSON) - } - } - - // Test 
unmarshaling - got, err := ParseItemJSON(data) - if err != nil { - t.Fatalf("ParseItemJSON() error = %v", err) - } - - // Run validation - tc.validate(t, got) - }) - } -} - -func TestParseItemJSONErrors(t *testing.T) { - t.Parallel() - - testCases := []struct { - name string - input string - wantErr bool - }{ - { - name: "invalid json", - input: "invalid json", - wantErr: true, - }, - { - name: "empty json", - input: "{}", - wantErr: false, - }, - { - name: "invalid regex in filter", - input: `{"time":"2024-01-01T00:00:00Z","line":"test","matched":{"regex":"[invalid"}}`, - wantErr: true, - }, - } - - for _, tc := range testCases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - - _, err := ParseItemJSON([]byte(tc.input)) - if (err != nil) != tc.wantErr { - t.Errorf("ParseItemJSON() error = %v, wantErr %v", err, tc.wantErr) - } - }) - } -} diff --git a/pkg/query/log/registry.go b/pkg/query/log/registry.go deleted file mode 100644 index 9842c10a..00000000 --- a/pkg/query/log/registry.go +++ /dev/null @@ -1,25 +0,0 @@ -package log - -import "sync" - -var ( - globalPollersMu sync.RWMutex - - // maps the file name to its poller - // assume, the common filters are merged for all components - globalPollers = make(map[string]Poller) -) - -func RegisterPoller(poller Poller) { - globalPollersMu.Lock() - defer globalPollersMu.Unlock() - - globalPollers[poller.File()] = poller -} - -func GetPoller(fileName string) Poller { - globalPollersMu.RLock() - defer globalPollersMu.RUnlock() - - return globalPollers[fileName] -} diff --git a/pkg/query/log/state/state.go b/pkg/query/log/state/state.go deleted file mode 100644 index e6a048ea..00000000 --- a/pkg/query/log/state/state.go +++ /dev/null @@ -1,62 +0,0 @@ -// Package state provides the persistent storage layer for the log poller. -package state - -import ( - "context" - "database/sql" - "fmt" - "time" - - "github.com/leptonai/gpud/pkg/sqlite" -) - -const TableNameLogFileSeekInfo = "components_query_log_seek_info" - -const ( - ColumnFile = "file" - - // File seek info offset. - ColumnOffset = "offset" - // File seek info whence. - ColumnWhence = "whence" -) - -func CreateTableLogFileSeekInfo(ctx context.Context, db *sql.DB) error { - _, err := db.ExecContext(ctx, fmt.Sprintf(` -CREATE TABLE IF NOT EXISTS %s ( - %s TEXT NOT NULL PRIMARY KEY, - %s INTEGER NOT NULL, - %s INTEGER NOT NULL -);`, TableNameLogFileSeekInfo, ColumnFile, ColumnOffset, ColumnWhence)) - return err -} - -func InsertLogFileSeekInfo(ctx context.Context, db *sql.DB, file string, offset int64, whence int64) error { - query := fmt.Sprintf(` -INSERT OR REPLACE INTO %s (%s, %s, %s) VALUES (?, ?, ?); -`, - TableNameLogFileSeekInfo, - ColumnFile, - ColumnOffset, - ColumnWhence, - ) - - start := time.Now() - _, err := db.ExecContext(ctx, query, file, offset, whence) - sqlite.RecordInsertUpdate(time.Since(start).Seconds()) - - return err -} - -// Returns "database/sql.ErrNoRows" if no record is found. 
-func GetLogFileSeekInfo(ctx context.Context, db *sql.DB, file string) (int64, int64, error) { - query := fmt.Sprintf(`SELECT %s, %s FROM %s WHERE %s = ?;`, ColumnOffset, ColumnWhence, TableNameLogFileSeekInfo, ColumnFile) - - start := time.Now() - row := db.QueryRowContext(ctx, query, file) - sqlite.RecordSelect(time.Since(start).Seconds()) - - var offset, whence int64 - err := row.Scan(&offset, &whence) - return offset, whence, err -} diff --git a/pkg/query/log/state/state_test.go b/pkg/query/log/state/state_test.go deleted file mode 100644 index 7f68d880..00000000 --- a/pkg/query/log/state/state_test.go +++ /dev/null @@ -1,103 +0,0 @@ -package state_test - -import ( - "context" - "database/sql" - "math/rand" - "os" - "testing" - "time" - - logstate "github.com/leptonai/gpud/pkg/query/log/state" - "github.com/leptonai/gpud/pkg/sqlite" -) - -func TestOpenMemory(t *testing.T) { - db, err := sqlite.Open(":memory:") - if err != nil { - t.Fatalf("failed to open database: %v", err) - } - defer db.Close() - - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - - if err := logstate.CreateTableLogFileSeekInfo(ctx, db); err != nil { - t.Fatalf("failed to create log table: %v", err) - } - - offset := rand.Int63n(10000) - whence := rand.Int63n(100) - if err := logstate.InsertLogFileSeekInfo(ctx, db, "test-file", offset, whence); err != nil { - t.Fatalf("failed to insert log: %v", err) - } - - offset2, whence2, err := logstate.GetLogFileSeekInfo(ctx, db, "test-file") - if err != nil { - t.Fatalf("failed to get log: %v", err) - } - if offset != offset2 || whence != whence2 { - t.Fatalf("log mismatch: %d %d %d %d", offset, whence, offset2, whence2) - } - - if _, _, err := logstate.GetLogFileSeekInfo(ctx, db, "invalid"); err != sql.ErrNoRows { - t.Fatalf("expected sql.ErrNoRows, got %v", err) - } -} - -func TestOpen(t *testing.T) { - f, err := os.CreateTemp(os.TempDir(), "test-db") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(f.Name()) - - db, err := sqlite.Open(f.Name()) - if err != nil { - t.Fatalf("failed to open database: %v", err) - } - - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - - if err := logstate.CreateTableLogFileSeekInfo(ctx, db); err != nil { - t.Fatalf("failed to create log table: %v", err) - } - - offset := rand.Int63n(10000) - whence := rand.Int63n(100) - if err := logstate.InsertLogFileSeekInfo(ctx, db, "test-file", offset, whence); err != nil { - t.Fatalf("failed to insert log: %v", err) - } - if err := logstate.InsertLogFileSeekInfo(ctx, db, "test-file", offset+1, whence); err != nil { - t.Fatalf("failed to insert log: %v", err) - } - - offset2, whence2, err := logstate.GetLogFileSeekInfo(ctx, db, "test-file") - if err != nil { - t.Fatalf("failed to get log: %v", err) - } - if offset+1 != offset2 || whence != whence2 { - t.Fatalf("log mismatch: %d %d %d %d", offset+1, whence, offset2, whence2) - } - - if _, _, err := logstate.GetLogFileSeekInfo(ctx, db, "invalid"); err != sql.ErrNoRows { - t.Fatalf("expected sql.ErrNoRows, got %v", err) - } - - db.Close() - - db, err = sqlite.Open(f.Name()) - if err != nil { - t.Fatalf("failed to open database: %v", err) - } - defer db.Close() - - offset3, whence3, err := logstate.GetLogFileSeekInfo(ctx, db, "test-file") - if err != nil { - t.Fatalf("failed to get log: %v", err) - } - if offset+1 != offset3 || whence != whence3 { - t.Fatalf("log mismatch: %d %d %d %d", offset+1, whence, offset3, whence3) - } -} diff --git 
a/pkg/query/log/tail/doc.go b/pkg/query/log/tail/doc.go deleted file mode 100644 index 451243c7..00000000 --- a/pkg/query/log/tail/doc.go +++ /dev/null @@ -1,2 +0,0 @@ -// Package tail implements the log file/output tail-ing operations. -package tail diff --git a/pkg/query/log/tail/options.go b/pkg/query/log/tail/options.go deleted file mode 100644 index b4b9db72..00000000 --- a/pkg/query/log/tail/options.go +++ /dev/null @@ -1,259 +0,0 @@ -package tail - -import ( - "errors" - "time" - - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" -) - -type Op struct { - labels map[string]string - - file string - commands [][]string - - linesToTail int - dedup bool - - perLineFunc func([]byte) - - matchFuncs []query_log_common.MatchFunc - selectFilters []*query_log_common.Filter - rejectFilters []*query_log_common.Filter - - extractTime query_log_common.ExtractTimeFunc - skipEmptyLine bool - - ProcessMatched query_log_common.ProcessMatchedFunc -} - -type OpOption func(*Op) - -func (op *Op) ApplyOpts(opts []OpOption) error { - for _, opt := range opts { - opt(op) - } - - if op.labels == nil { - op.labels = make(map[string]string) - } - - if op.file == "" && len(op.commands) == 0 { - return errors.New("file or commands must be set") - } - - if op.linesToTail == 0 { - op.linesToTail = 100 - } - - if len(op.selectFilters) > 0 && len(op.rejectFilters) > 0 { - return errors.New("cannot set both select and reject filters") - } - for i := range op.selectFilters { - if err := op.selectFilters[i].Compile(); err != nil { - return err - } - } - for i := range op.rejectFilters { - if err := op.rejectFilters[i].Compile(); err != nil { - return err - } - } - - if op.extractTime == nil { - op.extractTime = func(line []byte) (time.Time, []byte, error) { - return time.Time{}, line, nil - } - } - if op.ProcessMatched == nil { - op.ProcessMatched = func(time.Time, []byte, *query_log_common.Filter) {} - } - - return nil -} - -func WithLabel(key, value string) OpOption { - return func(op *Op) { - if op.labels == nil { - op.labels = make(map[string]string) - } - op.labels[key] = value - } -} - -func WithFile(file string) OpOption { - return func(op *Op) { - op.file = file - } -} - -func WithCommands(commands [][]string) OpOption { - return func(op *Op) { - op.commands = commands - } -} - -// Sets the number of lines to tail. -// If not set, defaults to 100. -func WithLinesToTail(n int) OpOption { - return func(op *Op) { - op.linesToTail = n - } -} - -// If true, dedup lines by the log line string. -// This is useful for logs that have the same message -// repeated multiple times with the same timestamp. -func WithDedup(dedup bool) OpOption { - return func(op *Op) { - op.dedup = dedup - } -} - -// Called for each line. -func WithPerLineFunc(f func([]byte)) OpOption { - return func(op *Op) { - op.perLineFunc = f - } -} - -// "OR" conditions to select logs. -// The line is sent when any of the match function returns non-empty strings. -func WithMatchFunc(matchFuncs ...query_log_common.MatchFunc) OpOption { - return func(op *Op) { - if len(matchFuncs) > 0 { - op.matchFuncs = append(op.matchFuncs, matchFuncs...) - } - } -} - -// "OR" conditions to select logs. -// -// The line is sent when any of the filters match. -// Useful for explicit blacklisting "error" logs -// (e.g., GPU error messages in dmesg). -func WithSelectFilter(filters ...*query_log_common.Filter) OpOption { - return func(op *Op) { - if len(filters) > 0 { - op.selectFilters = append(op.selectFilters, filters...) 
- } - } -} - -// "AND" conditions to exclude logs. -// -// The line is sent if and only if all of the filters do not match. -// Useful for explicit whitelisting logs and catch all other -// (e.g., good healthy log messages). -func WithRejectFilter(filters ...*query_log_common.Filter) OpOption { - return func(op *Op) { - if len(filters) > 0 { - op.rejectFilters = append(op.rejectFilters, filters...) - } - } -} - -func (op *Op) applyFilter(line any) (shouldInclude bool, matchedFilter *query_log_common.Filter, err error) { - if len(op.matchFuncs) == 0 && len(op.selectFilters) == 0 && len(op.rejectFilters) == 0 { - // no filters - return true, nil, nil - } - - for _, matchFunc := range op.matchFuncs { - var eventName string - switch line := line.(type) { - case string: - eventName, _ = matchFunc(line) - case []byte: - eventName, _ = matchFunc(string(line)) - } - if eventName != "" { - filter := &query_log_common.Filter{ - Name: eventName, - } - return true, filter, nil - } - } - if len(op.selectFilters) == 0 && len(op.rejectFilters) == 0 { - return false, nil, nil - } - - // blacklist (e.g., error logs) - for _, filter := range op.selectFilters { - // assume regex is already compiled - var matched bool - switch line := line.(type) { - case string: - matched, err = filter.MatchString(line) - case []byte: - matched, err = filter.MatchBytes(line) - } - if err != nil { // regex has not been compiled - return false, nil, err - } - if matched { - matchedFilter = filter - break - } - } - if len(op.selectFilters) > 0 && matchedFilter == nil { - // select filter non-empty, and the line didn't pass any - // thus should not be included - return false, nil, nil - } - - // whitelist (e.g., good logs) - rejected := false - for _, filter := range op.rejectFilters { - // assume regex is already compiled - var matched bool - switch line := line.(type) { - case string: - matched, err = filter.MatchString(line) - case []byte: - matched, err = filter.MatchBytes(line) - } - if err != nil { // regex has not been compiled - return false, nil, err - } - if matched { - rejected = true - break - } - } - - if rejected { - // means, the line matches a good log line regex - // thus should not be marked as an event - return false, nil, nil - } - - return true, matchedFilter, nil -} - -func WithExtractTime(f query_log_common.ExtractTimeFunc) OpOption { - return func(op *Op) { - if f != nil { - op.extractTime = f - } - } -} - -func WithSkipEmptyLine(skipEmptyLine bool) OpOption { - return func(op *Op) { - op.skipEmptyLine = skipEmptyLine - } -} - -// Called if the line is matched. -// If not set, the matched line is no-op. -// Useful to append to a slice or not to return a string slice -// to avoid extra heap allocation. -func WithProcessMatched(f query_log_common.ProcessMatchedFunc) OpOption { - return func(op *Op) { - if f != nil { - op.ProcessMatched = f - } - } -} diff --git a/pkg/query/log/tail/scan.go b/pkg/query/log/tail/scan.go deleted file mode 100644 index 066f955a..00000000 --- a/pkg/query/log/tail/scan.go +++ /dev/null @@ -1,205 +0,0 @@ -package tail - -import ( - "context" - "errors" - "io" - "os" - "sync" - "time" - - "github.com/leptonai/gpud/pkg/log" - "github.com/leptonai/gpud/pkg/process" -) - -var dedupMapPool = sync.Pool{ - New: func() interface{} { - return make(map[string]struct{}, 200) - }, -} - -// Scan scans the file or commands output from the end of the file -// and return the number of matched lines. -// It returns the lines in the reverse order that evaluates true -// for the "match" function. 
-// If the match function is nil, returns all. -func Scan(ctx context.Context, opts ...OpOption) (int, error) { - op := &Op{} - if err := op.ApplyOpts(opts); err != nil { - return 0, err - } - - file := op.file - if file == "" { - if len(op.commands) == 0 { - return 0, errors.New("file or commands must be set") - } - - f, err := os.CreateTemp(os.TempDir(), "tailscan*.txt") - if err != nil { - return 0, err - } - defer os.Remove(f.Name()) - file = f.Name() - - log.Logger.Debugw("writing commands to file to scan", "commands", op.commands) - p, err := process.New(process.WithCommands(op.commands), process.WithRunAsBashScript(), process.WithOutputFile(f)) - if err != nil { - return 0, err - } - if err := p.Start(ctx); err != nil { - return 0, err - } - defer func() { - if err := p.Close(ctx); err != nil { - log.Logger.Warnw("failed to abort command", "err", err) - } - }() - - select { - case <-ctx.Done(): - return 0, ctx.Err() - case err := <-p.Wait(): - if err != nil { - return 0, err - } - } - if err := f.Sync(); err != nil { - return 0, err - } - } - - f, err := os.Open(file) - if err != nil { - return 0, err - } - defer f.Close() - - stat, err := f.Stat() - if err != nil { - return 0, err - } - fileSize := stat.Size() - - // use regular buffers for chunk and line reading - chunkBuf := make([]byte, 4096) - lineBuf := make([]byte, 0, 256) - - // read backwards from the end of the file - scannedLines := 0 - matchedLines := 0 - - var dedupedLines map[string]struct{} - if op.dedup { - // only use sync.Pool for dedup map - dedupedLines = dedupMapPool.Get().(map[string]struct{}) - defer func() { - // clear the map before returning it to pool - for k := range dedupedLines { - delete(dedupedLines, k) - } - dedupMapPool.Put(dedupedLines) - }() - } - - processLine := func(buf []byte) error { - reverse(buf) - scannedLines++ - - if op.perLineFunc != nil { - op.perLineFunc(buf) - } - - shouldInclude, matchedFilter, err := op.applyFilter(buf) - if err != nil { - return err - } - if !shouldInclude { - return nil - } - - if op.dedup { - if _, ok := dedupedLines[string(buf)]; ok { - // skip duplicate - return nil - } - - dedupedLines[string(buf)] = struct{}{} - } - - matchedLines++ - - var extractedTime time.Time - if op.extractTime != nil { - parsedTime, extractedLine, err := op.extractTime(buf) - if err != nil { - return err - } - if len(extractedLine) > 0 { - extractedTime = parsedTime.UTC() - buf = extractedLine - } - } - - if op.ProcessMatched != nil { - op.ProcessMatched(extractedTime, buf, matchedFilter) - } - - return nil - } - - defer func() { - log.Logger.Debugw("scanned lines", "lines", scannedLines, "matched", matchedLines) - }() - for offset := fileSize; offset > 0; { - chunkSize := int64(len(chunkBuf)) - if offset < chunkSize { - chunkSize = offset - } - offset -= chunkSize - - if _, serr := f.Seek(offset, io.SeekStart); serr != nil { - return 0, serr - } - if _, rerr := f.Read(chunkBuf[:chunkSize]); rerr != nil { - return 0, rerr - } - - for i := chunkSize - 1; i >= 0; i-- { - if scannedLines == op.linesToTail { - return matchedLines, nil - } - - // still processing a line - if chunkBuf[i] != '\n' { - lineBuf = append(lineBuf, chunkBuf[i]) - continue - } - - // end of a line but no content - if len(lineBuf) == 0 { - continue - } - - if err := processLine(lineBuf); err != nil { - return 0, err - } - - lineBuf = lineBuf[:0] - } - } - - if len(lineBuf) > 0 && scannedLines < op.linesToTail { - if err := processLine(lineBuf); err != nil { - return 0, err - } - } - - return matchedLines, nil -} - 
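For context on what is being removed here: the deleted Scan above composes the functional options defined in the deleted options.go. Below is a minimal, self-contained sketch (not taken from the repository) of how a caller would have driven the now-removed API, mirroring the deleted scan_test.go; the log file path is a made-up placeholder.

// Sketch of pre-removal usage of pkg/query/log/tail.Scan, assuming the
// option functions and Filter fields shown in the deleted files above.
package main

import (
	"context"
	"fmt"
	"time"

	query_log_common "github.com/leptonai/gpud/pkg/query/log/common"
	"github.com/leptonai/gpud/pkg/query/log/tail"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	// Keep only lines containing "error"; Filter.Substring is a *string.
	sub := "error"

	var matched []string
	n, err := tail.Scan(
		ctx,
		tail.WithFile("/var/log/kubelet.log"), // hypothetical path, for illustration only
		tail.WithLinesToTail(100),
		tail.WithDedup(true),
		tail.WithSelectFilter(&query_log_common.Filter{Substring: &sub}),
		// Called once per matched line; lines arrive newest-first because
		// Scan reads the file backwards from the end.
		tail.WithProcessMatched(func(_ time.Time, line []byte, _ *query_log_common.Filter) {
			matched = append(matched, string(line))
		}),
	)
	if err != nil {
		panic(err)
	}

	fmt.Printf("matched %d of the last 100 lines\n", n)
	for _, l := range matched {
		fmt.Println(l)
	}
}

Note on the removed design: Scan seeks to the end of the file and walks backwards in fixed 4096-byte chunks, reassembling lines in reverse, so tailing the last N lines never requires loading the whole file into memory.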
-func reverse(b []byte) { - for i, j := 0, len(b)-1; i < j; i, j = i+1, j-1 { - b[i], b[j] = b[j], b[i] - } -} diff --git a/pkg/query/log/tail/scan_benchmark_test.go b/pkg/query/log/tail/scan_benchmark_test.go deleted file mode 100644 index dbd48861..00000000 --- a/pkg/query/log/tail/scan_benchmark_test.go +++ /dev/null @@ -1,113 +0,0 @@ -package tail - -import ( - "context" - "testing" - "time" - - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - - "k8s.io/utils/ptr" -) - -// go test -bench=BenchmarkScan -benchmem -// go test -bench=BenchmarkScan_DmesgLog -benchmem -func BenchmarkScan_DmesgLog(b *testing.B) { - ctx := context.Background() - - benchmarks := []struct { - name string - linesToTail int - withFilter bool - dedup bool - }{ - {"Tail100NoFilter", 100, false, false}, - {"Tail1000NoFilter", 1000, false, false}, - {"Tail100WithFilter", 100, true, false}, - {"Tail1000WithFilter", 1000, true, false}, - - {"Tail100NoFilterWithDedup", 100, false, true}, - {"Tail1000NoFilterWithDedup", 1000, false, true}, - {"Tail100WithFilterWithDedup", 100, true, true}, - {"Tail1000WithFilterWithDedup", 1000, true, true}, - } - - for _, bm := range benchmarks { - b.Run(bm.name, func(b *testing.B) { - var opts []OpOption - opts = append(opts, - WithFile("testdata/dmesg.0.log"), - WithLinesToTail(bm.linesToTail), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(_ time.Time, line []byte, _ *query_log_common.Filter) {}), - ) - - if bm.withFilter { - opts = append(opts, WithSelectFilter(&query_log_common.Filter{ - Substring: ptr.To("error"), - })) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := Scan(ctx, opts...) - if err != nil { - b.Fatal(err) - } - } - }) - } -} - -// go test -bench=BenchmarkScan -benchmem -// go test -bench=BenchmarkScan_KubeletLog -benchmem -func BenchmarkScan_KubeletLog(b *testing.B) { - ctx := context.Background() - - benchmarks := []struct { - name string - linesToTail int - withFilter bool - dedup bool - }{ - {"Tail100NoFilter", 100, false, false}, - {"Tail1000NoFilter", 1000, false, false}, - {"Tail100WithFilter", 100, true, false}, - {"Tail1000WithFilter", 1000, true, false}, - - {"Tail100NoFilterWithDedup", 100, false, true}, - {"Tail1000NoFilterWithDedup", 1000, false, true}, - {"Tail100WithFilterWithDedup", 100, true, true}, - {"Tail1000WithFilterWithDedup", 1000, true, true}, - } - - for _, bm := range benchmarks { - b.Run(bm.name, func(b *testing.B) { - var opts []OpOption - opts = append(opts, - WithFile("testdata/kubelet.0.log"), - WithLinesToTail(bm.linesToTail), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(_ time.Time, line []byte, _ *query_log_common.Filter) {}), - ) - - if bm.withFilter { - opts = append(opts, WithSelectFilter(&query_log_common.Filter{ - Substring: ptr.To("error"), - })) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := Scan(ctx, opts...) 
- if err != nil { - b.Fatal(err) - } - } - }) - } -} diff --git a/pkg/query/log/tail/scan_test.go b/pkg/query/log/tail/scan_test.go deleted file mode 100644 index 75415b59..00000000 --- a/pkg/query/log/tail/scan_test.go +++ /dev/null @@ -1,906 +0,0 @@ -package tail - -import ( - "bytes" - "context" - "fmt" - "os" - "reflect" - "strings" - "testing" - "time" - - "k8s.io/utils/ptr" - - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" -) - -func TestScan(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - - tmpf, err := os.CreateTemp("", "test*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - content := "line1\nline2\nline3\nline4\nline5\n" - if _, err := tmpf.Write([]byte(content)); err != nil { - t.Fatalf("failed to write to temp file: %v", err) - } - if err := tmpf.Close(); err != nil { - t.Fatalf("failed to close temp file: %v", err) - } - t.Logf("wrote %q", tmpf.Name()) - - largeTmpf, err := os.CreateTemp("", "large_test*.txt") - if err != nil { - t.Fatalf("failed to create large temp file: %v", err) - } - defer os.Remove(largeTmpf.Name()) - - // Write 1000 lines to the large file - for i := 1; i <= 1000; i++ { - if _, err := largeTmpf.WriteString(fmt.Sprintf("line%d\n", i)); err != nil { - t.Fatalf("failed to write to large temp file: %v", err) - } - } - if err := largeTmpf.Close(); err != nil { - t.Fatalf("failed to close large temp file: %v", err) - } - - tests := []struct { - name string - fileName string - commandArgs []string - n int - matchFuncs []query_log_common.MatchFunc - selectFilters []*query_log_common.Filter - want []string - wantError bool - }{ - { - name: "tail 3 lines", - fileName: tmpf.Name(), - n: 3, - want: []string{"line5", "line4", "line3"}, - }, - { - name: "tail more lines than file contains", - fileName: tmpf.Name(), - n: 10, - want: []string{"line5", "line4", "line3", "line2", "line1"}, - }, - { - name: "tail with match function", - fileName: tmpf.Name(), - n: 3, - selectFilters: []*query_log_common.Filter{ - {Regex: ptr.To("3")}, - {Regex: ptr.To("5")}, - }, - want: []string{"line5", "line3"}, - }, - { - name: "tail with match function", - fileName: tmpf.Name(), - n: 3, - selectFilters: []*query_log_common.Filter{ - {Substring: ptr.To("3")}, - {Substring: ptr.To("5")}, - }, - want: []string{"line5", "line3"}, - }, - { - name: "non-existent file", - fileName: "non-existent_file", - n: 3, - wantError: true, - }, - - { - name: "tail 100 lines from large file", - fileName: largeTmpf.Name(), - n: 100, - want: generateExpectedLines(1000, 100), - }, - { - name: "tail 100 lines from large file but with cat", - commandArgs: []string{"cat", largeTmpf.Name()}, - n: 100, - want: generateExpectedLines(1000, 100), - }, - - { - name: "tail 1000 lines from large file", - fileName: largeTmpf.Name(), - n: 1000, - want: generateExpectedLines(1000, 1000), - }, - { - name: "tail 1000 lines from large file but with cat", - commandArgs: []string{"cat", largeTmpf.Name()}, - n: 1000, - want: generateExpectedLines(1000, 1000), - }, - - { - name: "tail with regex filter on large file", - fileName: largeTmpf.Name(), - n: 1000, - selectFilters: []*query_log_common.Filter{ - {Regex: ptr.To("line(50|100|150)")}, - }, - want: []string{"line1000", "line509", "line508", "line507", "line506", "line505", "line504", "line503", "line502", "line501", "line500", "line150", "line100", "line50"}, - }, - { - name: "tail with regex filter on large file but with cat", - 
commandArgs: []string{"cat", largeTmpf.Name()}, - n: 1000, - selectFilters: []*query_log_common.Filter{ - {Regex: ptr.To("line(50|100|150)")}, - }, - want: []string{"line1000", "line509", "line508", "line507", "line506", "line505", "line504", "line503", "line502", "line501", "line500", "line150", "line100", "line50"}, - }, - - { - name: "tail kubelet.0.log", - fileName: "testdata/kubelet.0.log", - n: 5, - want: nil, // We'll check the length instead of exact content - }, - { - name: "tail kubelet.0.log but with cat", - commandArgs: []string{"cat", "testdata/kubelet.0.log"}, - n: 5, - want: nil, // We'll check the length instead of exact content - }, - - { - name: "tail kubelet.0.log with filter", - fileName: "testdata/kubelet.0.log", - n: 1000, - selectFilters: []*query_log_common.Filter{ - {Substring: ptr.To("error")}, - }, - want: nil, // We'll check the length instead of exact content - }, - { - name: "tail kubelet.0.log with filter but with cat", - commandArgs: []string{"cat", "testdata/kubelet.0.log"}, - n: 1000, - selectFilters: []*query_log_common.Filter{ - {Substring: ptr.To("error")}, - }, - want: nil, // We'll check the length instead of exact content - }, - { - name: "tail with match function - single match", - fileName: tmpf.Name(), - n: 5, - matchFuncs: []query_log_common.MatchFunc{ - func(line string) (string, string) { - if line == "line3" { - return "found_line3", line - } - return "", "" - }, - }, - want: []string{"line3"}, - }, - { - name: "tail with match function - multiple matches", - fileName: tmpf.Name(), - n: 5, - matchFuncs: []query_log_common.MatchFunc{ - func(line string) (string, string) { - if strings.Contains(line, "line3") || strings.Contains(line, "line5") { - return "found_line3_or_5", line - } - return "", "" - }, - }, - want: []string{"line5", "line3"}, - }, - { - name: "tail with multiple match functions", - fileName: tmpf.Name(), - n: 5, - matchFuncs: []query_log_common.MatchFunc{ - func(line string) (string, string) { - if strings.Contains(line, "line3") { - return "found_line3", line - } - return "", "" - }, - func(line string) (string, string) { - if strings.Contains(line, "line5") { - return "found_line5", line - } - return "", "" - }, - }, - want: []string{"line5", "line3"}, - }, - { - name: "tail with match function - no matches", - fileName: tmpf.Name(), - n: 5, - matchFuncs: []query_log_common.MatchFunc{ - func(line string) (string, string) { - return "", "" - }, - }, - want: nil, - }, - { - name: "tail with match function on large file", - fileName: largeTmpf.Name(), - n: 1000, - matchFuncs: []query_log_common.MatchFunc{ - func(line string) (string, string) { - if strings.Contains(line, "line100") || strings.Contains(line, "line500") { - return "found_special_lines", line - } - return "", "" - }, - }, - want: []string{"line1000", "line500", "line100"}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var got []string - _, err := Scan( - ctx, - WithFile(tt.fileName), - WithCommands([][]string{tt.commandArgs}), - WithLinesToTail(tt.n), - WithSelectFilter(tt.selectFilters...), - WithMatchFunc(tt.matchFuncs...), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(time time.Time, line []byte, filter *query_log_common.Filter) { - got = append(got, string(line)) - }), - ) - - if (err != nil) != tt.wantError { - t.Errorf("Scan = %v, wantError %v", err, tt.wantError) - return - } - - if tt.fileName == "testdata/kubelet.0.log" || 
strings.Contains(strings.Join(tt.commandArgs, " "), "testdata/kubelet.0.log") { - // For kubelet.0.log, we'll just check if we got any results - if len(got) == 0 { - t.Errorf("Scan on kubelet.0.log returned no results") - } - if tt.selectFilters != nil && len(got) == 0 { - t.Errorf("Scan on kubelet.0.log with filter returned no results") - } - } else if !reflect.DeepEqual(got, tt.want) { - t.Errorf("Scan = %q, want %q", got, tt.want) - } - }) - } -} - -func generateExpectedLines(total, n int) []string { - var result []string - for i := total; i > total-n && i > 0; i-- { - result = append(result, fmt.Sprintf("line%d", i)) - } - return result -} - -func TestScan_LastLineWithoutNewline(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - // Create temp file with content that doesn't end in newline - tmpf, err := os.CreateTemp("", "test_nonewline*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - // Write content without final newline - content := "line1\nline2\nline3\nfinal_line_no_newline" - if _, err := tmpf.Write([]byte(content)); err != nil { - t.Fatalf("failed to write to temp file: %v", err) - } - if err := tmpf.Close(); err != nil { - t.Fatalf("failed to close temp file: %v", err) - } - - tests := []struct { - name string - linesToTail int - selectFilters []*query_log_common.Filter - want []string - }{ - { - name: "tail 2 lines with last line having no newline", - linesToTail: 2, - want: []string{"final_line_no_newline", "line3"}, - }, - { - name: "tail all lines with last line having no newline", - linesToTail: 5, - want: []string{"final_line_no_newline", "line3", "line2", "line1"}, - }, - { - name: "tail with filter matching last line", - linesToTail: 5, - selectFilters: []*query_log_common.Filter{ - {Substring: ptr.To("final")}, - }, - want: []string{"final_line_no_newline"}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var got []string - _, err := Scan( - ctx, - WithFile(tmpf.Name()), - WithLinesToTail(tt.linesToTail), - WithSelectFilter(tt.selectFilters...), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(time time.Time, line []byte, filter *query_log_common.Filter) { - got = append(got, string(line)) - }), - ) - - if err != nil { - t.Errorf("unexpected error: %v", err) - return - } - - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("Scan = %v, want %v", got, tt.want) - } - }) - } -} - -func TestScan_Dedup(t *testing.T) { - ctx := context.Background() - - // Create temp file with duplicate lines - tmpf, err := os.CreateTemp("", "test_dedup*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - // Write content with duplicate lines in different patterns - content := strings.Join([]string{ - "unique_line_1", - "duplicate_line", - "unique_line_2", - "duplicate_line", // Immediate duplicate - "unique_line_3", - "duplicate_line", // Distant duplicate - "unique_line_4", - "DUPLICATE_LINE", // Case different but same content when lowercased - "unique_line_5", - "duplicate_line\n", // With trailing newline - "unique_line_6", - }, "\n") - - if _, err := tmpf.Write([]byte(content)); err != nil { - t.Fatalf("failed to write to temp file: %v", err) - } - if err := tmpf.Close(); err != nil { - t.Fatalf("failed to close temp file: %v", err) - } - - tests := []struct { - name string - linesToTail int - dedup bool - want 
[]string - wantCount int - }{ - { - name: "no dedup", - linesToTail: 100, - dedup: false, - want: []string{ - "unique_line_6", - "duplicate_line", - "unique_line_5", - "DUPLICATE_LINE", - "unique_line_4", - "duplicate_line", - "unique_line_3", - "duplicate_line", - "unique_line_2", - "duplicate_line", - "unique_line_1", - }, - wantCount: 11, - }, - { - name: "with dedup", - linesToTail: 100, - dedup: true, - want: []string{ - "unique_line_6", - "duplicate_line", - "unique_line_5", - "DUPLICATE_LINE", - "unique_line_4", - "unique_line_3", - "unique_line_2", - "unique_line_1", - }, - wantCount: 8, - }, - { - name: "dedup with limited lines", - linesToTail: 5, - dedup: true, - want: []string{ - "unique_line_6", - "duplicate_line", - "unique_line_5", - "DUPLICATE_LINE", - "unique_line_4", - }, - wantCount: 5, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var got []string - count, err := Scan( - ctx, - WithFile(tmpf.Name()), - WithLinesToTail(tt.linesToTail), - WithDedup(tt.dedup), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(time time.Time, line []byte, filter *query_log_common.Filter) { - got = append(got, string(line)) - }), - ) - - if err != nil { - t.Errorf("unexpected error: %v", err) - return - } - - if count != tt.wantCount { - t.Errorf("got count = %d, want %d", count, tt.wantCount) - } - - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("got lines = %v, want %v", got, tt.want) - } - }) - } -} - -func TestScan_DedupWithFilters(t *testing.T) { - ctx := context.Background() - - // Create temp file with duplicate lines and different patterns - tmpf, err := os.CreateTemp("", "test_dedup_filter*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - content := strings.Join([]string{ - "error: duplicate error message", - "info: some info", - "error: duplicate error message", - "warning: some warning", - "error: different error", - "error: duplicate error message", - "info: another info", - }, "\n") - - if _, err := tmpf.Write([]byte(content)); err != nil { - t.Fatalf("failed to write to temp file: %v", err) - } - if err := tmpf.Close(); err != nil { - t.Fatalf("failed to close temp file: %v", err) - } - - tests := []struct { - name string - linesToTail int - dedup bool - selectFilters []*query_log_common.Filter - want []string - wantCount int - }{ - { - name: "filter without dedup", - linesToTail: 100, - dedup: false, - selectFilters: []*query_log_common.Filter{ - {Substring: ptr.To("error")}, - }, - want: []string{ - "error: duplicate error message", - "error: different error", - "error: duplicate error message", - "error: duplicate error message", - }, - wantCount: 4, - }, - { - name: "filter with dedup", - linesToTail: 100, - dedup: true, - selectFilters: []*query_log_common.Filter{ - {Substring: ptr.To("error")}, - }, - want: []string{ - "error: duplicate error message", - "error: different error", - }, - wantCount: 2, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var got []string - count, err := Scan( - ctx, - WithFile(tmpf.Name()), - WithLinesToTail(tt.linesToTail), - WithDedup(tt.dedup), - WithSelectFilter(tt.selectFilters...), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(time time.Time, line []byte, filter *query_log_common.Filter) { - got = append(got, string(line)) - }), - ) - - if err != nil { 
- t.Errorf("unexpected error: %v", err) - return - } - - if count != tt.wantCount { - t.Errorf("got count = %d, want %d", count, tt.wantCount) - } - - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("got lines = %v, want %v", got, tt.want) - } - }) - } -} - -func TestScan_EmptyAndSmallFiles(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - tests := []struct { - name string - content string - linesToTail int - want []string - wantCount int - }{ - { - name: "empty file", - content: "", - linesToTail: 10, - want: []string{}, - wantCount: 0, - }, - { - name: "single line without newline", - content: "single line", - linesToTail: 10, - want: []string{"single line"}, - wantCount: 1, - }, - { - name: "single line with newline", - content: "single line\n", - linesToTail: 10, - want: []string{"single line"}, - wantCount: 1, - }, - { - name: "multiple empty lines", - content: "\n\n\n\n", - linesToTail: 10, - want: nil, - wantCount: 0, - }, - { - name: "lines smaller than chunk size", - content: strings.Repeat("short\n", 1000), - linesToTail: 5, - want: []string{ - "short", - "short", - "short", - "short", - "short", - }, - wantCount: 5, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - tmpf, err := os.CreateTemp("", "test_empty*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - if _, err := tmpf.Write([]byte(tt.content)); err != nil { - t.Fatalf("failed to write to temp file: %v", err) - } - if err := tmpf.Close(); err != nil { - t.Fatalf("failed to close temp file: %v", err) - } - - var got []string - count, err := Scan( - ctx, - WithFile(tmpf.Name()), - WithLinesToTail(tt.linesToTail), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(_ time.Time, line []byte, _ *query_log_common.Filter) { - got = append(got, string(line)) - }), - ) - - if err != nil { - t.Errorf("unexpected error: %v", err) - return - } - - if count != tt.wantCount { - t.Errorf("got count = %d, want %d", count, tt.wantCount) - } - - if len(got) == 0 && len(tt.want) == 0 { - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("got lines = %v, want %v", got, tt.want) - } - }) - } -} - -func TestScan_LongLines(t *testing.T) { - ctx := context.Background() - - // Create lines longer than the chunk size (4096 bytes) - longLine := strings.Repeat("x", 5000) - veryLongLine := strings.Repeat("y", 10000) - - tests := []struct { - name string - content string - linesToTail int - want []string - wantCount int - }{ - { - name: "single long line", - content: longLine, - linesToTail: 1, - want: []string{longLine}, - wantCount: 1, - }, - { - name: "multiple long lines", - content: longLine + "\n" + veryLongLine, - linesToTail: 2, - want: []string{veryLongLine, longLine}, - wantCount: 2, - }, - { - name: "mix of long and short lines", - content: strings.Join([]string{ - "short line", - longLine, - "another short line", - veryLongLine, - }, "\n"), - linesToTail: 3, - want: []string{ - veryLongLine, - "another short line", - longLine, - }, - wantCount: 3, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - tmpf, err := os.CreateTemp("", "test_longlines*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - if _, err := tmpf.Write([]byte(tt.content)); err != nil { - t.Fatalf("failed to write to temp file: %v", err) - } - if err := 
tmpf.Close(); err != nil { - t.Fatalf("failed to close temp file: %v", err) - } - - var got []string - count, err := Scan( - ctx, - WithFile(tmpf.Name()), - WithLinesToTail(tt.linesToTail), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(_ time.Time, line []byte, _ *query_log_common.Filter) { - got = append(got, string(line)) - }), - ) - - if err != nil { - t.Errorf("unexpected error: %v", err) - return - } - - if count != tt.wantCount { - t.Errorf("got count = %d, want %d", count, tt.wantCount) - } - - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("got lines = %v, want %v", got, tt.want) - } - }) - } -} - -func TestScan_CommandOutput(t *testing.T) { - ctx := context.Background() - - tests := []struct { - name string - commands [][]string - linesToTail int - wantErr bool - wantCount int - }{ - { - name: "simple echo command", - commands: [][]string{ - {"echo", "line1\necho line2\necho line3"}, - }, - linesToTail: 2, - wantCount: 2, - }, - { - name: "multiple commands", - commands: [][]string{ - {"echo", "line1"}, - {"echo", "line2"}, - }, - linesToTail: 5, - wantCount: 2, - }, - { - name: "command with error", - commands: [][]string{ - {"nonexistent_command"}, - }, - linesToTail: 5, - wantErr: true, - }, - { - name: "no commands", - commands: [][]string{}, - linesToTail: 5, - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var got []string - count, err := Scan( - ctx, - WithCommands(tt.commands), - WithLinesToTail(tt.linesToTail), - WithExtractTime(func(line []byte) (time.Time, []byte, error) { - return time.Time{}, nil, nil - }), - WithProcessMatched(func(_ time.Time, line []byte, _ *query_log_common.Filter) { - got = append(got, string(line)) - }), - ) - - if tt.wantErr { - if err == nil { - t.Error("expected error but got none") - } - return - } - - if err != nil { - t.Errorf("unexpected error: %v", err) - return - } - - if count != tt.wantCount { - t.Errorf("got count = %d, want %d", count, tt.wantCount) - } - }) - } -} - -func TestReverse(t *testing.T) { - tests := []struct { - name string - input []byte - want []byte - }{ - { - name: "empty", - input: []byte{}, - want: []byte{}, - }, - { - name: "single byte", - input: []byte{1}, - want: []byte{1}, - }, - { - name: "even length", - input: []byte{1, 2, 3, 4}, - want: []byte{4, 3, 2, 1}, - }, - { - name: "odd length", - input: []byte{1, 2, 3}, - want: []byte{3, 2, 1}, - }, - { - name: "string content", - input: []byte("hello"), - want: []byte("olleh"), - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Make a copy to avoid modifying test data - input := make([]byte, len(tt.input)) - copy(input, tt.input) - - reverse(input) - - if !bytes.Equal(input, tt.want) { - t.Errorf("reverse() = %v, want %v", input, tt.want) - } - }) - } -} diff --git a/pkg/query/log/tail/streamer.go b/pkg/query/log/tail/streamer.go deleted file mode 100644 index ea23722e..00000000 --- a/pkg/query/log/tail/streamer.go +++ /dev/null @@ -1,38 +0,0 @@ -package tail - -import ( - "sync" - - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - - "github.com/nxadm/tail" -) - -// Streamer defines the log tailer. -type Streamer interface { - // Returns the file that the streamer watches on. - File() string - // Returns the command arguments that the streamer watches on. - Commands() [][]string - - // Returns the line channel that the streaming lines are sent to. 
- Line() <-chan Line -} - -type Line struct { - *tail.Line - MatchedFilter *query_log_common.Filter -} - -type streamDeduper struct { - seen map[string]struct{} - mu sync.Mutex -} - -var seenPool = sync.Pool{ - New: func() interface{} { - return &streamDeduper{ - seen: make(map[string]struct{}, 1000), - } - }, -} diff --git a/pkg/query/log/tail/streamer_command.go b/pkg/query/log/tail/streamer_command.go deleted file mode 100644 index 0cf682dd..00000000 --- a/pkg/query/log/tail/streamer_command.go +++ /dev/null @@ -1,195 +0,0 @@ -package tail - -import ( - "bufio" - "context" - "fmt" - "time" - - "github.com/leptonai/gpud/pkg/log" - "github.com/leptonai/gpud/pkg/process" - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - - "github.com/nxadm/tail" -) - -func NewFromCommand(ctx context.Context, commands [][]string, opts ...OpOption) (Streamer, error) { - op := &Op{ - commands: commands, - } - if err := op.ApplyOpts(opts); err != nil { - return nil, err - } - - processOpts := []process.OpOption{process.WithCommands(op.commands), process.WithRunAsBashScript()} - for k, v := range op.labels { - processOpts = append(processOpts, process.WithLabel(k, v)) - } - p, err := process.New(processOpts...) - if err != nil { - return nil, err - } - if err := p.Start(ctx); err != nil { - return nil, err - } - - select { - case <-ctx.Done(): - return nil, ctx.Err() - case err := <-p.Wait(): - return nil, fmt.Errorf("command exited unexpectedly: %w", err) - case <-time.After(50 * time.Millisecond): - } - - stdoutScanner := bufio.NewScanner(p.StdoutReader()) - stderrScanner := bufio.NewScanner(p.StderrReader()) - - streamer := &commandStreamer{ - op: op, - ctx: ctx, - proc: p, - lineC: make(chan Line, 200), - dedupEnabled: op.dedup, - skipEmptyLine: op.skipEmptyLine, - } - - if op.dedup { - streamer.dedup = seenPool.Get().(*streamDeduper) - } - - go streamer.pollLoops(stdoutScanner) - go streamer.pollLoops(stderrScanner) - go streamer.waitCommand() - - return streamer, nil -} - -var _ Streamer = (*commandStreamer)(nil) - -type commandStreamer struct { - op *Op - ctx context.Context - proc process.Process - lineC chan Line - - dedupEnabled bool - dedup *streamDeduper - skipEmptyLine bool -} - -func (sr *commandStreamer) File() string { - return "" -} - -func (sr *commandStreamer) Commands() [][]string { - return sr.op.commands -} - -func (sr *commandStreamer) Line() <-chan Line { - return sr.lineC -} - -func (sr *commandStreamer) pollLoops(scanner *bufio.Scanner) { - var ( - err error - shouldInclude bool - matchedFilter *query_log_common.Filter - ) - - for scanner.Scan() { - select { - case <-sr.ctx.Done(): - return - default: - } - - txt := scanner.Text() - - if len(txt) == 0 && sr.skipEmptyLine { - continue - } - - if sr.dedupEnabled { - sr.dedup.mu.Lock() - _, exists := sr.dedup.seen[txt] - if exists { - sr.dedup.mu.Unlock() - continue - } - sr.dedup.seen[txt] = struct{}{} - sr.dedup.mu.Unlock() - } - - shouldInclude, matchedFilter, err = sr.op.applyFilter(txt) - if err != nil { - log.Logger.Warnw("error applying filter", "error", err) - continue - } - if !shouldInclude { - continue - } - - var extractedTime time.Time - scannedBytes := scanner.Bytes() - - if sr.op.extractTime != nil { - parsedTime, extractedLine, err := sr.op.extractTime(scannedBytes) - if err != nil { - log.Logger.Errorw("error extracting time", "error", err) - } else if len(extractedLine) > 0 { - extractedTime = parsedTime.UTC() - scannedBytes = extractedLine - } - } - - if extractedTime.IsZero() { - extractedTime = 
time.Now().UTC() - } - - if sr.op.ProcessMatched != nil { - sr.op.ProcessMatched(extractedTime, scannedBytes, matchedFilter) - } - - lineToSend := Line{ - Line: &tail.Line{ - Text: string(scannedBytes), - Time: extractedTime, - }, - MatchedFilter: matchedFilter, - } - - select { - case <-sr.ctx.Done(): - return - - case sr.lineC <- lineToSend: - - default: - log.Logger.Warnw("channel is full -- dropped output", "pid", sr.proc.PID(), "labels", sr.proc.Labels()) - } - } -} - -func (sr *commandStreamer) waitCommand() { - defer func() { - close(sr.lineC) - - if sr.dedupEnabled { - sr.dedup.mu.Lock() - for k := range sr.dedup.seen { - delete(sr.dedup.seen, k) - } - sr.dedup.mu.Unlock() - seenPool.Put(sr.dedup) - } - - if err := sr.proc.Close(sr.ctx); err != nil { - log.Logger.Warnw("failed to abort command", "err", err) - } - }() - - select { - case <-sr.ctx.Done(): - case <-sr.proc.Wait(): - } -} diff --git a/pkg/query/log/tail/streamer_command_test.go b/pkg/query/log/tail/streamer_command_test.go deleted file mode 100644 index 9797fb05..00000000 --- a/pkg/query/log/tail/streamer_command_test.go +++ /dev/null @@ -1,97 +0,0 @@ -package tail - -import ( - "context" - "fmt" - "os" - "testing" - "time" -) - -func TestCommandStreamer(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - - tmpf, err := os.CreateTemp("", "test*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - streamer, err := NewFromCommand(ctx, [][]string{{"tail", "-f", tmpf.Name()}}) - if err != nil { - t.Fatal(err) - } - - time.Sleep(time.Second) - - for i := 0; i < 10; i++ { - testLine := fmt.Sprintf("%d%d", i, time.Now().Nanosecond()) - if _, err := tmpf.WriteString(testLine + "\n"); err != nil { - t.Fatal(err) - } - - select { - case line := <-streamer.Line(): - t.Logf("received %q", line.Text) - if line.Text != testLine { - t.Fatalf("expected %q, got %q", testLine, line.Text) - } - case <-time.After(3 * time.Second): - t.Fatal("timeout") - } - } - - t.Logf("%+v\n", streamer.Commands()) -} - -func TestCommandStreamerWithDedup(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - - tmpf, err := os.CreateTemp("", "test*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - streamer, err := NewFromCommand( - ctx, - [][]string{{"tail", "-f", tmpf.Name()}}, - WithDedup(true), - ) - if err != nil { - t.Fatal(err) - } - - time.Sleep(time.Second) - - // Write same line multiple times - testLine := "duplicate line" - for i := 0; i < 10; i++ { - if _, err := tmpf.WriteString(testLine + "\n"); err != nil { - t.Fatal(err) - } - } - - // Should only receive one line despite writing multiple - select { - case line := <-streamer.Line(): - t.Logf("received %q", line.Text) - if line.Text != testLine { - t.Fatalf("expected %q, got %q", testLine, line.Text) - } - case <-time.After(3 * time.Second): - t.Fatal("timeout waiting for first line") - } - - // Verify no more lines are received (as they should be deduped) - select { - case line := <-streamer.Line(): - t.Fatalf("unexpected line received: %q", line.Text) - case <-time.After(2 * time.Second): - // This is the expected path - no additional lines should be received - } - - t.Logf("%+v\n", streamer.Commands()) -} diff --git a/pkg/query/log/tail/streamer_file.go b/pkg/query/log/tail/streamer_file.go deleted file mode 100644 index 1d420229..00000000 --- 
a/pkg/query/log/tail/streamer_file.go +++ /dev/null @@ -1,159 +0,0 @@ -package tail - -import ( - "context" - "time" - - "github.com/leptonai/gpud/pkg/log" - query_log_common "github.com/leptonai/gpud/pkg/query/log/common" - - "github.com/nxadm/tail" -) - -func NewFromFile(ctx context.Context, file string, seek *tail.SeekInfo, opts ...OpOption) (Streamer, error) { - op := &Op{ - file: file, - } - if err := op.ApplyOpts(opts); err != nil { - return nil, err - } - - f, err := tail.TailFile( - file, - tail.Config{ - Location: seek, - - Follow: true, - ReOpen: true, - MustExist: false, - - // we don't need real-time logs - // using polling for reliability (vs. fsnotify) - Poll: true, - - Logger: tail.DefaultLogger, - }, - ) - if err != nil { - return nil, err - } - - sr := &fileStreamer{ - ctx: ctx, - op: op, - file: f, - lineC: make(chan Line, 1000), - dedupEnabled: op.dedup, - extractTime: op.extractTime, - skipEmptyLine: op.skipEmptyLine, - } - if op.dedup { - sr.dedup = seenPool.Get().(*streamDeduper) - } - - go sr.pollLoops() - - return sr, nil -} - -var _ Streamer = (*fileStreamer)(nil) - -type fileStreamer struct { - ctx context.Context - op *Op - file *tail.Tail - lineC chan Line - dedupEnabled bool - dedup *streamDeduper - extractTime query_log_common.ExtractTimeFunc - skipEmptyLine bool -} - -func (sr *fileStreamer) File() string { - return sr.file.Filename -} - -func (sr *fileStreamer) Commands() [][]string { - return nil -} - -func (sr *fileStreamer) Line() <-chan Line { - return sr.lineC -} - -func (sr *fileStreamer) pollLoops() { - prevTime := time.Time{} - for line := range sr.file.Lines { - shouldInclude, matchedFilter, err := sr.op.applyFilter(line.Text) - if err != nil { - log.Logger.Warnw("error applying filter", "error", err) - continue - } - if !shouldInclude { - continue - } - - txt := line.Text - - if len(txt) == 0 && sr.skipEmptyLine { - continue - } - - if sr.dedupEnabled { - sr.dedup.mu.Lock() - _, exists := sr.dedup.seen[txt] - if exists { - sr.dedup.mu.Unlock() - continue - } - sr.dedup.seen[txt] = struct{}{} - sr.dedup.mu.Unlock() - } - - if sr.extractTime != nil { - parsedTime, _, err := sr.extractTime([]byte(txt)) - if err == nil { - line.Time = parsedTime - } else { - log.Logger.Warnw("error extracting time", "error", err) - } - - if line.Time.IsZero() && !prevTime.IsZero() { - line.Time = prevTime - } - - if err == nil { - prevTime = parsedTime - } - } - - if sr.op.ProcessMatched != nil { - sr.op.ProcessMatched(line.Time, []byte(line.Text), matchedFilter) - } - - lineToSend := Line{ - Line: line, - MatchedFilter: matchedFilter, - } - - select { - case <-sr.ctx.Done(): - sr.file.Done() - - if sr.dedupEnabled { - sr.dedup.mu.Lock() - for k := range sr.dedup.seen { - delete(sr.dedup.seen, k) - } - sr.dedup.mu.Unlock() - seenPool.Put(sr.dedup) - } - return - - case sr.lineC <- lineToSend: - - default: - log.Logger.Warnw("channel is full -- dropped output", "file", sr.file.Filename) - } - } -} diff --git a/pkg/query/log/tail/streamer_file_test.go b/pkg/query/log/tail/streamer_file_test.go deleted file mode 100644 index 3fd0e8ab..00000000 --- a/pkg/query/log/tail/streamer_file_test.go +++ /dev/null @@ -1,159 +0,0 @@ -package tail - -import ( - "bytes" - "context" - "fmt" - "os" - "regexp" - "testing" - "time" - - "github.com/leptonai/gpud/pkg/log" -) - -func TestFileStreamer(t *testing.T) { - tmpf, err := os.CreateTemp("", "test*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - ctx, cancel := 
context.WithCancel(context.Background()) - defer cancel() - streamer, err := NewFromFile(ctx, tmpf.Name(), nil) - if err != nil { - t.Fatal(err) - } - - time.Sleep(time.Second) - - for i := 0; i < 10; i++ { - testLine := fmt.Sprintf("%d%d", i, time.Now().Nanosecond()) - if _, err := tmpf.WriteString(testLine + "\n"); err != nil { - t.Fatal(err) - } - - select { - case line := <-streamer.Line(): - t.Logf("received %q (%v, %+v)", line.Text, line.Time, line.SeekInfo) - if line.Text != testLine { - t.Fatalf("expected %q, got %q", testLine, line.Text) - } - case <-time.After(3 * time.Second): - t.Fatal("timeout") - } - } -} - -func TestFileStreamerWithDedup(t *testing.T) { - tmpf, err := os.CreateTemp("", "test*.txt") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - defer os.Remove(tmpf.Name()) - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - streamer, err := NewFromFile(ctx, tmpf.Name(), nil, WithDedup(true)) - if err != nil { - t.Fatal(err) - } - - time.Sleep(time.Second) - - // Write same line multiple times - testLine := "duplicate line" - for i := 0; i < 10; i++ { - if _, err := tmpf.WriteString(testLine + "\n"); err != nil { - t.Fatal(err) - } - } - - // Should only receive one line despite writing three - select { - case line := <-streamer.Line(): - t.Logf("received %q (%v, %+v)", line.Text, line.Time, line.SeekInfo) - if line.Text != testLine { - t.Fatalf("expected %q, got %q", testLine, line.Text) - } - case <-time.After(3 * time.Second): - t.Fatal("timeout waiting for first line") - } - - // Verify no more lines are received (as they should be deduped) - select { - case line := <-streamer.Line(): - t.Fatalf("unexpected line received: %q", line.Text) - case <-time.After(2 * time.Second): - // This is the expected path - no additional lines should be received - } -} - -func TestFileStreamerWithExtractTime(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - streamer, err := NewFromFile(ctx, "testdata/fabric-manager.0.log", nil, WithExtractTime(extractTimeFromLogLine), WithSkipEmptyLine(true)) - if err != nil { - t.Fatal(err) - } - - time.Sleep(time.Second) - - for i := 0; i < 30; i++ { - select { - case line := <-streamer.Line(): - t.Logf("received %q (%v, %+v)", line.Text, line.Time, line.SeekInfo) - - // "[Dec 18 2024" - if line.Time.IsZero() { - t.Fatalf("expected non-zero time, got %v", line.Time) - } - if line.Time.Year() != 2024 { - t.Fatalf("expected 2024, got %v", line.Time.Year()) - } - if line.Time.Month() != time.December { - t.Fatalf("expected December, got %v", line.Time.Month()) - } - if line.Time.Day() < 18 || line.Time.Day() > 20 { - t.Fatalf("expected day between 18 and 20, got %v", line.Time.Day()) - } - - case <-time.After(3 * time.Second): - t.Fatal("timeout") - } - } -} - -var regexForFabricmanagerLog = regexp.MustCompile(`^\[([^\]]+)\]`) - -const fabricmanagerLogTimeFormat = "Jan 02 2006 15:04:05" - -var fabricmanagerLogTimeFormatN = len(fabricmanagerLogTimeFormat) + 2 // [ ] - -// does not return error for now -// example log line: "[May 02 2024 18:41:23] [INFO] [tid 404868] Abort CUDA jobs when FM exits = 1" -// TODO: once stable return error -func extractTimeFromLogLine(line []byte) (time.Time, []byte, error) { - matches := regexForFabricmanagerLog.FindStringSubmatch(string(line)) - if len(matches) == 0 { - log.Logger.Debugw("no timestamp matches found", "line", string(line)) - return time.Time{}, nil, nil - } - - s := matches[1] - - parsedTime, err := 
time.Parse("Jan 02 2006 15:04:05", s) - if err != nil { - log.Logger.Debugw("failed to parse timestamp", "line", string(line), "error", err) - return time.Time{}, nil, nil - } - - if len(line) <= fabricmanagerLogTimeFormatN { - return parsedTime, nil, nil - } - - extractedLine := bytes.TrimSpace(line[fabricmanagerLogTimeFormatN:]) - return parsedTime, extractedLine, nil -} diff --git a/pkg/query/log/tail/testdata/dmesg.0.log b/pkg/query/log/tail/testdata/dmesg.0.log deleted file mode 100644 index 9fe38db5..00000000 --- a/pkg/query/log/tail/testdata/dmesg.0.log +++ /dev/null @@ -1,41 +0,0 @@ -2024-11-15T05:39:15,491363+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,502491+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,513618+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,524696+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,535709+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,546781+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,557860+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,568948+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,580123+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T05:39:15,591167+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:03,470068+00:00 NVRM: Xid (PCI:0000:19:00): 48, pid='', name=, An uncorrectable double bit error (DBE) has been detected on GPU in the framebuffer at physAddr 0x12bac1c3e0 partition 2, subpartition 1. -2024-11-15T12:02:03,498753+00:00 NVRM: Xid (PCI:0000:19:00): 63, pid='', name=, Row Remapper: New row (0x00000012bac1c3e0) marked for remapping, reset gpu to activate. -2024-11-15T12:02:03,506595+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid='', name=, Contained: SM (0x1). 
RST: No, D-RST: No -2024-11-15T12:02:03,521616+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid=1248209, name=python3.10, Ch 00000008 -2024-11-15T12:02:03,552642+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid=1248209, name=python3.10, Ch 00000009 -2024-11-15T12:02:03,561522+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid=1248209, name=python3.10, Ch 0000000a -2024-11-15T12:02:03,570101+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid=1248209, name=python3.10, Ch 0000000b -2024-11-15T12:02:03,578496+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid=1248209, name=python3.10, Ch 0000000c -2024-11-15T12:02:03,587265+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid=1248209, name=python3.10, Ch 0000000d -2024-11-15T12:02:03,630019+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid=1248209, name=python3.10, Ch 0000000e -2024-11-15T12:02:03,638794+00:00 NVRM: Xid (PCI:0000:19:00): 94, pid=1248209, name=python3.10, Ch 0000000f -2024-11-15T12:02:15,737526+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,748853+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,760199+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,771485+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,782907+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,794161+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,805365+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,816541+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,827798+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,838837+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,849947+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,861190+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,872315+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,883572+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,894792+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,905966+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,917157+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,928245+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,939340+00:00 nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing -2024-11-15T12:02:15,950445+00:00 nvidia-peermem 
nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing \ No newline at end of file diff --git a/pkg/query/log/tail/testdata/fabric-manager.0.log b/pkg/query/log/tail/testdata/fabric-manager.0.log deleted file mode 100644 index 119a0690..00000000 --- a/pkg/query/log/tail/testdata/fabric-manager.0.log +++ /dev/null @@ -1,499 +0,0 @@ -[Dec 18 2024 19:09:50] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:81312046c77d043a status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe aa8f62a63b474b1c 42b6541a7c99b6c6 2d4131df9e28ae09 8e4e8b14fb982022 - -[Dec 18 2024 19:09:50] [INFO] [tid 1584] multicast group 1 is allocated. -[Dec 18 2024 19:09:50] [INFO] [tid 1584] successfully setup multicast team for request id 9309257393118184506. -[Dec 18 2024 19:09:50] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:81312046c77d043a status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:be1e310e660a89ee Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 18 2024 20:50:46] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:4110fd7539ed0d0 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:78cdb1f7062f4f83 Flags:0 - -[Dec 18 2024 20:50:46] [INFO] [tid 1584] multicast group 0 is freed. -[Dec 18 2024 20:50:46] [INFO] [tid 1584] successfully released multicast team with handle 8704809329295839107. -[Dec 18 2024 20:50:50] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:6750810887294585 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:be1e310e660a89ee Flags:0 - -[Dec 18 2024 20:50:50] [INFO] [tid 1584] multicast group 1 is freed. -[Dec 18 2024 20:50:50] [INFO] [tid 1584] successfully released multicast team with handle 13699441054418897390. -[Dec 18 2024 20:51:11] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:474aad0a33fb6a4 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe 2d4131df9e28ae09 aa8f62a63b474b1c 42b6541a7c99b6c6 8e4e8b14fb982022 - -[Dec 18 2024 20:51:11] [INFO] [tid 1584] multicast group 0 is allocated. -[Dec 18 2024 20:51:11] [INFO] [tid 1584] successfully setup multicast team for request id 321069286518929060. -[Dec 18 2024 20:51:11] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:474aad0a33fb6a4 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:964808ca391ac Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 18 2024 20:55:07] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:9e2e8b1fb8f4bea status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe aa8f62a63b474b1c 42b6541a7c99b6c6 8e4e8b14fb982022 2d4131df9e28ae09 - -[Dec 18 2024 20:55:07] [INFO] [tid 1584] multicast group 1 is allocated. 
-[Dec 18 2024 20:55:07] [INFO] [tid 1584] successfully setup multicast team for request id 712387542205287402. -[Dec 18 2024 20:55:07] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:9e2e8b1fb8f4bea status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:d6b847c17db40ca1 Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 18 2024 22:34:26] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:8d09fc52bf342405 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:964808ca391ac Flags:0 - -[Dec 18 2024 22:34:26] [INFO] [tid 1584] multicast group 0 is freed. -[Dec 18 2024 22:34:26] [INFO] [tid 1584] successfully released multicast team with handle 2643778068517292. -[Dec 18 2024 22:34:29] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:348a69958ddca488 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:d6b847c17db40ca1 Flags:0 - -[Dec 18 2024 22:34:29] [INFO] [tid 1584] multicast group 1 is freed. -[Dec 18 2024 22:34:29] [INFO] [tid 1584] successfully released multicast team with handle 15472195416194550945. -[Dec 18 2024 22:34:53] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:cc4711684cdb2102 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe aa8f62a63b474b1c 42b6541a7c99b6c6 8e4e8b14fb982022 2d4131df9e28ae09 - -[Dec 18 2024 22:34:53] [INFO] [tid 1584] multicast group 0 is allocated. -[Dec 18 2024 22:34:53] [INFO] [tid 1584] successfully setup multicast team for request id 14719753046747455746. -[Dec 18 2024 22:34:53] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:cc4711684cdb2102 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:6d83a908a8b0d1d4 Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 18 2024 22:38:37] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:14480877ee058b3a status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe 2d4131df9e28ae09 42b6541a7c99b6c6 aa8f62a63b474b1c 8e4e8b14fb982022 - -[Dec 18 2024 22:38:37] [INFO] [tid 1584] multicast group 1 is allocated. -[Dec 18 2024 22:38:37] [INFO] [tid 1584] successfully setup multicast team for request id 1461427390269197114. -[Dec 18 2024 22:38:37] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:14480877ee058b3a status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:979d23505db5f18 Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 19 2024 00:21:29] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:dec71a0f0d4e0cad status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:6d83a908a8b0d1d4 Flags:0 - -[Dec 19 2024 00:21:29] [INFO] [tid 1584] multicast group 0 is freed. -[Dec 19 2024 00:21:29] [INFO] [tid 1584] successfully released multicast team with handle 7891336826738233812. 
-[Dec 19 2024 00:21:32] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:6cd477f793909239 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:979d23505db5f18 Flags:0 - -[Dec 19 2024 00:21:32] [INFO] [tid 1584] multicast group 1 is freed. -[Dec 19 2024 00:21:32] [INFO] [tid 1584] successfully released multicast team with handle 682807943696703256. -[Dec 19 2024 00:21:57] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:8945132d9ce96b32 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe 8e4e8b14fb982022 2d4131df9e28ae09 42b6541a7c99b6c6 aa8f62a63b474b1c - -[Dec 19 2024 00:21:57] [INFO] [tid 1584] multicast group 0 is allocated. -[Dec 19 2024 00:21:57] [INFO] [tid 1584] successfully setup multicast team for request id 9891333243216161586. -[Dec 19 2024 00:21:57] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:8945132d9ce96b32 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:a89525ac41102638 Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 19 2024 00:25:55] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:6f8311d7764da20d status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d aa8f62a63b474b1c 2d4131df9e28ae09 10dbe3d570b7debe 42b6541a7c99b6c6 8e4e8b14fb982022 - -[Dec 19 2024 00:25:55] [INFO] [tid 1584] multicast group 1 is allocated. -[Dec 19 2024 00:25:55] [INFO] [tid 1584] successfully setup multicast team for request id 8035285777259536909. -[Dec 19 2024 00:25:55] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:6f8311d7764da20d status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:d49eacc764073ab1 Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 19 2024 02:12:52] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:8b33fc7f6a9e666 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:a89525ac41102638 Flags:0 - -[Dec 19 2024 02:12:52] [INFO] [tid 1584] multicast group 0 is freed. -[Dec 19 2024 02:12:52] [INFO] [tid 1584] successfully released multicast team with handle 12147656991657961016. -[Dec 19 2024 02:12:56] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:ce7318160f3f0735 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:d49eacc764073ab1 Flags:0 - -[Dec 19 2024 02:12:56] [INFO] [tid 1584] multicast group 1 is freed. -[Dec 19 2024 02:12:56] [INFO] [tid 1584] successfully released multicast team with handle 15320872954737670833. 
-[Dec 19 2024 02:13:18] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:ab27f999d1a250a9 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe 2d4131df9e28ae09 8e4e8b14fb982022 42b6541a7c99b6c6 aa8f62a63b474b1c - -[Dec 19 2024 02:13:18] [INFO] [tid 1584] multicast group 0 is allocated. -[Dec 19 2024 02:13:18] [INFO] [tid 1584] successfully setup multicast team for request id 12333100543619780777. -[Dec 19 2024 02:13:18] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:ab27f999d1a250a9 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:e9a52160a174f724 Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 19 2024 02:17:23] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:2cc1d727d41d10b7 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe 2d4131df9e28ae09 8e4e8b14fb982022 aa8f62a63b474b1c 42b6541a7c99b6c6 - -[Dec 19 2024 02:17:23] [INFO] [tid 1584] multicast group 1 is allocated. -[Dec 19 2024 02:17:23] [INFO] [tid 1584] successfully setup multicast team for request id 3225095374236356791. -[Dec 19 2024 02:17:23] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:2cc1d727d41d10b7 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:e3068dd12eb96b72 Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 19 2024 03:58:57] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:d84deeee6937d065 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:e9a52160a174f724 Flags:0 - -[Dec 19 2024 03:58:57] [INFO] [tid 1584] multicast group 0 is freed. -[Dec 19 2024 03:58:57] [INFO] [tid 1584] successfully released multicast team with handle 16835899480903841572. -[Dec 19 2024 03:59:01] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:9df8998d58020a85 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:e3068dd12eb96b72 Flags:0 - -[Dec 19 2024 03:59:01] [INFO] [tid 1584] multicast group 1 is freed. -[Dec 19 2024 03:59:01] [INFO] [tid 1584] successfully released multicast team with handle 16358918626041490290. -[Dec 19 2024 03:59:23] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:63444a283670ded9 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe 2d4131df9e28ae09 42b6541a7c99b6c6 8e4e8b14fb982022 aa8f62a63b474b1c - -[Dec 19 2024 03:59:23] [INFO] [tid 1584] multicast group 0 is allocated. -[Dec 19 2024 03:59:23] [INFO] [tid 1584] successfully setup multicast team for request id 7152923644743704281. 
-[Dec 19 2024 03:59:23] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:63444a283670ded9 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:bebe950509906d00 Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 19 2024 04:03:08] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:46503cdff1320da1 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 2d4131df9e28ae09 aa8f62a63b474b1c 8e4e8b14fb982022 42b6541a7c99b6c6 91f357befdf5779d 10dbe3d570b7debe - -[Dec 19 2024 04:03:08] [INFO] [tid 1584] multicast group 1 is allocated. -[Dec 19 2024 04:03:08] [INFO] [tid 1584] successfully setup multicast team for request id 5066616513313770913. -[Dec 19 2024 04:03:08] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:46503cdff1320da1 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:89f7de0f7ca8f5cc Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 19 2024 05:44:57] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:a08081c8f6f667af status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:bebe950509906d00 Flags:0 - -[Dec 19 2024 05:44:57] [INFO] [tid 1584] multicast group 0 is freed. -[Dec 19 2024 05:44:57] [INFO] [tid 1584] successfully released multicast team with handle 13744586961649167616. -[Dec 19 2024 05:45:00] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:7c5f1a73c9e11bff status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:89f7de0f7ca8f5cc Flags:0 - -[Dec 19 2024 05:45:00] [INFO] [tid 1584] multicast group 1 is freed. -[Dec 19 2024 05:45:00] [INFO] [tid 1584] successfully released multicast team with handle 9941658860540982732. -[Dec 19 2024 05:45:22] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:211a576abf67ef47 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe 2d4131df9e28ae09 8e4e8b14fb982022 42b6541a7c99b6c6 aa8f62a63b474b1c - -[Dec 19 2024 05:45:22] [INFO] [tid 1584] multicast group 0 is allocated. -[Dec 19 2024 05:45:22] [INFO] [tid 1584] successfully setup multicast team for request id 2385315068635508551. -[Dec 19 2024 05:45:22] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:211a576abf67ef47 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:fb6bb5925c36ddcf Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 19 2024 05:49:28] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:6ad541ee106b8ac9 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe aa8f62a63b474b1c 42b6541a7c99b6c6 2d4131df9e28ae09 8e4e8b14fb982022 - -[Dec 19 2024 05:49:28] [INFO] [tid 1584] multicast group 1 is allocated. -[Dec 19 2024 05:49:28] [INFO] [tid 1584] successfully setup multicast team for request id 7698131628793236169. 
-[Dec 19 2024 05:49:28] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:6ad541ee106b8ac9 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:b8231268a7c42f4a Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 19 2024 05:51:53] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:ceb452f47332e769 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:b8231268a7c42f4a Flags:0 - -[Dec 19 2024 05:51:53] [INFO] [tid 1584] multicast group 1 is freed. -[Dec 19 2024 05:51:53] [INFO] [tid 1584] successfully released multicast team with handle 13268469167864164170. -[Dec 19 2024 05:51:53] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:5c5eef9dee70e63b status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:fb6bb5925c36ddcf Flags:0 - -[Dec 19 2024 05:51:53] [INFO] [tid 1584] multicast group 0 is freed. -[Dec 19 2024 05:51:53] [INFO] [tid 1584] successfully released multicast team with handle 18116773566244904399. -[Dec 19 2024 05:55:13] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:27173e309e6735f9 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 ebd867c5b4df712f 91f357befdf5779d 10dbe3d570b7debe aa8f62a63b474b1c 8e4e8b14fb982022 42b6541a7c99b6c6 2d4131df9e28ae09 - -[Dec 19 2024 05:55:13] [INFO] [tid 1584] multicast group 0 is allocated. -[Dec 19 2024 05:55:13] [INFO] [tid 1584] successfully setup multicast team for request id 2816788470480451065. -[Dec 19 2024 05:55:13] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:27173e309e6735f9 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:3dbce3d9d84006d2 Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 19 2024 05:59:13] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:4381a7dee8d7a213 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:203acba0a7853521 10dbe3d570b7debe 91f357befdf5779d ebd867c5b4df712f 2d4131df9e28ae09 aa8f62a63b474b1c 42b6541a7c99b6c6 8e4e8b14fb982022 - -[Dec 19 2024 05:59:13] [INFO] [tid 1584] multicast group 1 is allocated. -[Dec 19 2024 05:59:13] [INFO] [tid 1584] successfully setup multicast team for request id 4864353648367870483. 
-[Dec 19 2024 05:59:13] [INFO] [tid 1372] Sending inband response message: Message header details: magic Id:adbc request Id:4381a7dee8d7a213 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:972312c9c1e1a7c0 Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:83:00.0 physical id 0 port 34 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:83:00.0 physical id 0 port 34 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:84:00.0 physical id 1 port 35 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:84:00.0 physical id 1 port 35 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:84:00.0 physical id 1 port 42 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:84:00.0 physical id 1 port 42 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:85:00.0 physical id 2 port 0 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:85:00.0 physical id 2 port 0 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:85:00.0 physical id 2 port 1 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:85:00.0 physical id 2 port 1 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:85:00.0 physical id 2 port 19 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:85:00.0 physical id 2 port 19 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:85:00.0 physical id 2 port 33 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:85:00.0 physical id 2 port 33 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:86:00.0 physical id 3 port 33 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:86:00.0 physical id 3 port 33 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:86:00.0 physical id 3 port 36 -[Dec 19 2024 19:17:49] [ERROR] [tid 1374] detected NVSwitch non-fatal error 12028 on fid 0 on NVSwitch pci bus id 00000000:86:00.0 physical id 3 port 36 -[Dec 19 2024 19:18:07] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:c0ffe177f10f3fc7 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:972312c9c1e1a7c0 Flags:0 - -[Dec 19 2024 19:18:07] [INFO] [tid 1584] multicast group 1 is freed. -[Dec 19 2024 19:18:07] [INFO] [tid 1584] successfully released multicast team with handle 10890568981662508992. 
-[Dec 19 2024 19:18:07] [INFO] [tid 1579] Received an inband message: Message header details: magic Id:adbc request Id:4064dce4c3d2390f status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:3dbce3d9d84006d2 Flags:0 - -[Dec 19 2024 19:18:07] [INFO] [tid 1584] multicast group 0 is freed. -[Dec 19 2024 19:18:07] [INFO] [tid 1584] successfully released multicast team with handle 4448681056710690514. -Fabric Manager Log initializing at: 12/19/2024 22:08:53.808 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Fabric Manager version 535.161.08 is running with the following configuration options -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Logging level = 4 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Logging file name/path = /var/log/fabricmanager.log -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Append to log file = 1 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Max Log file size = 1024 (MBs) -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Use Syslog file = 0 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Fabric Manager communication ports = 16000 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Fabric Mode = 0 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Fabric Mode Restart = 0 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] FM Library communication bind interface = 127.0.0.1 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] FM Library communication unix domain socket = -[Dec 19 2024 22:08:53] [INFO] [tid 1570] FM Library communication port number = 6666 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Continue to run when facing failures = 0 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Option when facing GPU to NVSwitch NVLink failure = 0 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Option when facing NVSwitch to NVSwitch NVLink failure = 0 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Option when facing NVSwitch failure = 0 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] Abort CUDA jobs when FM exits = 1 -[Dec 19 2024 22:08:53] [INFO] [tid 1570] LMDB_LOG: Successfully initialized LMDB -[Dec 19 2024 22:08:54] [INFO] [tid 1570] Connected to 1 node. - -[Dec 19 2024 22:08:54] [INFO] [tid 1570] Getting fabric node FM version info -[Dec 19 2024 22:08:54] [INFO] [tid 1570] getting NVSwitch device information -[Dec 19 2024 22:08:54] [INFO] [tid 1570] detected system topology is based on DGX/HGX H100 -[Dec 19 2024 22:08:54] [INFO] [tid 1570] parsed fabric topology file /usr/share/nvidia/nvswitch/dgxh100_hgxh100_topology successfully. topology name: DGXH100_HGXH100, build time: Mon Feb 27 17:25:17 2023 -. -[Dec 19 2024 22:08:54] [INFO] [tid 1570] fabric topology file /usr/share/nvidia/nvswitch/dgxh100_hgxh100_topology is parsed. 
-[Dec 19 2024 22:08:54] [INFO] [tid 1570] number of devices specified in topology file NVSwitches: 4, GPUs: 8 -[Dec 19 2024 22:08:54] [INFO] [tid 1570] getting NVSwitch device information -[Dec 19 2024 22:08:54] [INFO] [tid 1570] dumping all the detected NVSwitch information - Index: 00 Physical Id: 0 PCI Bus ID: 00000000:83:00.0 Enabled Link Mask: ffffffff00000000 Arch Type: 3 UUID : SWX-8CDF9499-4961-23EF-5606-A2FA47356210 - Index: 01 Physical Id: 1 PCI Bus ID: 00000000:84:00.0 Enabled Link Mask: ffffffff000000ff Arch Type: 3 UUID : SWX-5D459227-13D4-09CC-6592-B4E9EDF852DA - Index: 02 Physical Id: 2 PCI Bus ID: 00000000:85:00.0 Enabled Link Mask: ffffffff000f000f Arch Type: 3 UUID : SWX-EC1FEEAB-5F16-D04F-450B-7A29EB9A13CB - Index: 03 Physical Id: 3 PCI Bus ID: 00000000:86:00.0 Enabled Link Mask: ffffffff00000000 Arch Type: 3 UUID : SWX-35D5221F-8694-1C35-EE0B-B8C8E9224226 - -[Dec 19 2024 22:08:54] [INFO] [tid 1570] number of GPU base board detected: 1 -[Dec 19 2024 22:08:54] [INFO] [tid 1570] getting NVLink device information -[Dec 19 2024 22:08:54] [INFO] [tid 1570] NVLink Inband feature is enabled. Hence Fabric Manager is not opening and operating on GPUs directly. -[Dec 19 2024 22:08:54] [INFO] [tid 1570] NVLink Autonomous Link Initialization (ALI) feature is enabled. -[Dec 19 2024 22:08:54] [INFO] [tid 1570] start NVSwitch 0/0 routing configuration -[Dec 19 2024 22:08:54] [INFO] [tid 1570] completed NVSwitch 0/0 routing configuration -[Dec 19 2024 22:08:54] [INFO] [tid 1570] start NVSwitch 0/1 routing configuration -[Dec 19 2024 22:08:54] [INFO] [tid 1570] completed NVSwitch 0/1 routing configuration -[Dec 19 2024 22:08:54] [INFO] [tid 1570] start NVSwitch 0/2 routing configuration -[Dec 19 2024 22:08:54] [INFO] [tid 1570] completed NVSwitch 0/2 routing configuration -[Dec 19 2024 22:08:54] [INFO] [tid 1570] start NVSwitch 0/3 routing configuration -[Dec 19 2024 22:08:54] [INFO] [tid 1570] completed NVSwitch 0/3 routing configuration -[Dec 19 2024 22:08:54] [INFO] [tid 1570] Successfully configured all the available NVSwitches to route GPU NVLink traffic. NVLink Peer-to-Peer support will be enabled once the GPUs are successfully registered with the NVLink fabric. -[Dec 19 2024 22:08:55] [INFO] [tid 1799] Received an inband message: Message header details: magic Id:adbc request Id:1812b35be83a37c0 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:9b00 Module Id:6 Uuid:GPU-2b6f61f9-8b00-3b57-d912-b62792120662 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 19 2024 22:08:55] [INFO] [tid 1804] added GPU with UUID GPU-2b6f61f9-8b00-3b57-d912-b62792120662 based on NVLink Inband GPU probe request. 
-[Dec 19 2024 22:08:55] [INFO] [tid 1804] fid: 0 Physical ID: 6 Index: 39680 Handle: 190f91b6 PCI Bus ID: 00000000:9B:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-2b6f61f9-8b00-3b57-d912-b62792120662 - -[Dec 19 2024 22:08:55] [INFO] [tid 1576] Sending inband response message: Message header details: magic Id:adbc request Id:1812b35be83a37c0 status:0 type:1 length:66 -Message payload details: Probe response: Handle:33403c39190f91b6 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8260000000000 GPA Address Range:8000000000 FLA Address:260000000000 FLA Address Range:8000000000 - -[Dec 19 2024 22:08:56] [INFO] [tid 1799] Received an inband message: Message header details: magic Id:adbc request Id:1812b359f3c96780 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:3b00 Module Id:3 Uuid:GPU-47daa2ad-a218-6286-252b-af45f0dfaf00 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 19 2024 22:08:56] [INFO] [tid 1804] added GPU with UUID GPU-47daa2ad-a218-6286-252b-af45f0dfaf00 based on NVLink Inband GPU probe request. -[Dec 19 2024 22:08:56] [INFO] [tid 1804] fid: 0 Physical ID: 3 Index: 15104 Handle: 2e11efad PCI Bus ID: 00000000:3B:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-47daa2ad-a218-6286-252b-af45f0dfaf00 - -[Dec 19 2024 22:08:56] [INFO] [tid 1576] Sending inband response message: Message header details: magic Id:adbc request Id:1812b359f3c96780 status:0 type:1 length:66 -Message payload details: Probe response: Handle:22b5b2982e11efad GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8230000000000 GPA Address Range:8000000000 FLA Address:230000000000 FLA Address Range:8000000000 - -[Dec 19 2024 22:08:57] [INFO] [tid 1799] Received an inband message: Message header details: magic Id:adbc request Id:1812b35c91f45b58 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:bb00 Module Id:4 Uuid:GPU-980ea11b-91dc-db7b-72ac-0685abc43658 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 19 2024 22:08:57] [INFO] [tid 1804] added GPU with UUID GPU-980ea11b-91dc-db7b-72ac-0685abc43658 based on NVLink Inband GPU probe request. -[Dec 19 2024 22:08:57] [INFO] [tid 1804] fid: 0 Physical ID: 4 Index: 47872 Handle: eec7dc43 PCI Bus ID: 00000000:BB:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-980ea11b-91dc-db7b-72ac-0685abc43658 - -[Dec 19 2024 22:08:57] [INFO] [tid 1576] Sending inband response message: Message header details: magic Id:adbc request Id:1812b35c91f45b58 status:0 type:1 length:66 -Message payload details: Probe response: Handle:5a23e8e3eec7dc43 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8240000000000 GPA Address Range:8000000000 FLA Address:240000000000 FLA Address Range:8000000000 - -[Dec 19 2024 22:09:00] [INFO] [tid 1799] Received an inband message: Message header details: magic Id:adbc request Id:1812b35d3861e948 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:cb00 Module Id:5 Uuid:GPU-4ef109e9-e925-4834-2b85-0771b4ef696c Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 19 2024 22:09:00] [INFO] [tid 1804] added GPU with UUID GPU-4ef109e9-e925-4834-2b85-0771b4ef696c based on NVLink Inband GPU probe request. 
-[Dec 19 2024 22:09:00] [INFO] [tid 1804] fid: 0 Physical ID: 5 Index: 51968 Handle: 8defc83b PCI Bus ID: 00000000:CB:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-4ef109e9-e925-4834-2b85-0771b4ef696c - -[Dec 19 2024 22:09:00] [INFO] [tid 1576] Sending inband response message: Message header details: magic Id:adbc request Id:1812b35d3861e948 status:0 type:1 length:66 -Message payload details: Probe response: Handle:d32573fc8defc83b GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8250000000000 GPA Address Range:8000000000 FLA Address:250000000000 FLA Address Range:8000000000 - -[Dec 19 2024 22:09:00] [INFO] [tid 1799] Received an inband message: Message header details: magic Id:adbc request Id:1812b35a9b2df748 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:4c00 Module Id:0 Uuid:GPU-167d8376-ee13-fca4-938a-a05167c43cdc Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 19 2024 22:09:00] [INFO] [tid 1804] added GPU with UUID GPU-167d8376-ee13-fca4-938a-a05167c43cdc based on NVLink Inband GPU probe request. -[Dec 19 2024 22:09:00] [INFO] [tid 1804] fid: 0 Physical ID: 0 Index: 19456 Handle: ffd350b1 PCI Bus ID: 00000000:4C:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-167d8376-ee13-fca4-938a-a05167c43cdc - -[Dec 19 2024 22:09:00] [INFO] [tid 1576] Sending inband response message: Message header details: magic Id:adbc request Id:1812b35a9b2df748 status:0 type:1 length:66 -Message payload details: Probe response: Handle:4e2a94ceffd350b1 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8200000000000 GPA Address Range:8000000000 FLA Address:200000000000 FLA Address Range:8000000000 - -[Dec 19 2024 22:09:01] [INFO] [tid 1799] Received an inband message: Message header details: magic Id:adbc request Id:1812b35b4176c0f0 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:5d00 Module Id:2 Uuid:GPU-afe9f2ed-9dd0-572c-3cd8-b42b58b3bc99 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 19 2024 22:09:01] [INFO] [tid 1804] added GPU with UUID GPU-afe9f2ed-9dd0-572c-3cd8-b42b58b3bc99 based on NVLink Inband GPU probe request. -[Dec 19 2024 22:09:01] [INFO] [tid 1804] fid: 0 Physical ID: 2 Index: 23808 Handle: c8889154 PCI Bus ID: 00000000:5D:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-afe9f2ed-9dd0-572c-3cd8-b42b58b3bc99 - -[Dec 19 2024 22:09:01] [INFO] [tid 1576] Sending inband response message: Message header details: magic Id:adbc request Id:1812b35b4176c0f0 status:0 type:1 length:66 -Message payload details: Probe response: Handle:4e3dea39c8889154 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8220000000000 GPA Address Range:8000000000 FLA Address:220000000000 FLA Address Range:8000000000 - -[Dec 19 2024 22:09:02] [INFO] [tid 1799] Received an inband message: Message header details: magic Id:adbc request Id:1812b35ddf812a28 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:db00 Module Id:7 Uuid:GPU-b0fe4996-1f7c-91f6-0ccd-08ae3b7e23c5 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 19 2024 22:09:02] [INFO] [tid 1804] added GPU with UUID GPU-b0fe4996-1f7c-91f6-0ccd-08ae3b7e23c5 based on NVLink Inband GPU probe request. 
-[Dec 19 2024 22:09:02] [INFO] [tid 1804] fid: 0 Physical ID: 7 Index: 56064 Handle: 1f85f6ed PCI Bus ID: 00000000:DB:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-b0fe4996-1f7c-91f6-0ccd-08ae3b7e23c5 - -[Dec 19 2024 22:09:02] [INFO] [tid 1576] Sending inband response message: Message header details: magic Id:adbc request Id:1812b35ddf812a28 status:0 type:1 length:66 -Message payload details: Probe response: Handle:77c68cd81f85f6ed GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8270000000000 GPA Address Range:8000000000 FLA Address:270000000000 FLA Address Range:8000000000 - -[Dec 19 2024 22:09:03] [INFO] [tid 1799] Received an inband message: Message header details: magic Id:adbc request Id:1812b3594e37f9d8 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:1900 Module Id:1 Uuid:GPU-ee7954ec-da95-fd7a-ddfd-c6b32cc2f8aa Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 19 2024 22:09:03] [INFO] [tid 1804] added GPU with UUID GPU-ee7954ec-da95-fd7a-ddfd-c6b32cc2f8aa based on NVLink Inband GPU probe request. -[Dec 19 2024 22:09:03] [INFO] [tid 1804] fid: 0 Physical ID: 1 Index: 6400 Handle: 766d5cb5 PCI Bus ID: 00000000:19:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-ee7954ec-da95-fd7a-ddfd-c6b32cc2f8aa - -[Dec 19 2024 22:09:03] [INFO] [tid 1576] Sending inband response message: Message header details: magic Id:adbc request Id:1812b3594e37f9d8 status:0 type:1 length:66 -Message payload details: Probe response: Handle:e16e5bc5766d5cb5 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8210000000000 GPA Address Range:8000000000 FLA Address:210000000000 FLA Address Range:8000000000 - -Fabric Manager Log initializing at: 12/20/2024 00:36:34.217 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Fabric Manager version 535.161.08 is running with the following configuration options -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Logging level = 4 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Logging file name/path = /var/log/fabricmanager.log -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Append to log file = 1 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Max Log file size = 1024 (MBs) -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Use Syslog file = 0 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Fabric Manager communication ports = 16000 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Fabric Mode = 0 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Fabric Mode Restart = 0 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] FM Library communication bind interface = 127.0.0.1 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] FM Library communication unix domain socket = -[Dec 20 2024 00:36:34] [INFO] [tid 1597] FM Library communication port number = 6666 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Continue to run when facing failures = 0 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Option when facing GPU to NVSwitch NVLink failure = 0 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Option when facing NVSwitch to NVSwitch NVLink failure = 0 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Option when facing NVSwitch failure = 0 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] Abort CUDA jobs when FM exits = 1 -[Dec 20 2024 00:36:34] [INFO] [tid 1597] LMDB_LOG: Successfully initialized LMDB -[Dec 20 2024 00:36:35] [INFO] [tid 1597] Connected to 1 node. 
- -[Dec 20 2024 00:36:35] [INFO] [tid 1597] Getting fabric node FM version info -[Dec 20 2024 00:36:35] [INFO] [tid 1597] getting NVSwitch device information -[Dec 20 2024 00:36:35] [INFO] [tid 1597] detected system topology is based on DGX/HGX H100 -[Dec 20 2024 00:36:35] [INFO] [tid 1597] parsed fabric topology file /usr/share/nvidia/nvswitch/dgxh100_hgxh100_topology successfully. topology name: DGXH100_HGXH100, build time: Mon Feb 27 17:25:17 2023 -. -[Dec 20 2024 00:36:35] [INFO] [tid 1597] fabric topology file /usr/share/nvidia/nvswitch/dgxh100_hgxh100_topology is parsed. -[Dec 20 2024 00:36:35] [INFO] [tid 1597] number of devices specified in topology file NVSwitches: 4, GPUs: 8 -[Dec 20 2024 00:36:35] [INFO] [tid 1597] getting NVSwitch device information -[Dec 20 2024 00:36:35] [INFO] [tid 1597] dumping all the detected NVSwitch information - Index: 00 Physical Id: 0 PCI Bus ID: 00000000:83:00.0 Enabled Link Mask: ffffffff00000000 Arch Type: 3 UUID : SWX-8CDF9499-4961-23EF-5606-A2FA47356210 - Index: 01 Physical Id: 1 PCI Bus ID: 00000000:84:00.0 Enabled Link Mask: ffffffff000000ff Arch Type: 3 UUID : SWX-5D459227-13D4-09CC-6592-B4E9EDF852DA - Index: 02 Physical Id: 2 PCI Bus ID: 00000000:85:00.0 Enabled Link Mask: ffffffff000f000f Arch Type: 3 UUID : SWX-EC1FEEAB-5F16-D04F-450B-7A29EB9A13CB - Index: 03 Physical Id: 3 PCI Bus ID: 00000000:86:00.0 Enabled Link Mask: ffffffff00000000 Arch Type: 3 UUID : SWX-35D5221F-8694-1C35-EE0B-B8C8E9224226 - -[Dec 20 2024 00:36:35] [INFO] [tid 1597] number of GPU base board detected: 1 -[Dec 20 2024 00:36:35] [INFO] [tid 1597] getting NVLink device information -[Dec 20 2024 00:36:35] [INFO] [tid 1597] NVLink Inband feature is enabled. Hence Fabric Manager is not opening and operating on GPUs directly. -[Dec 20 2024 00:36:35] [INFO] [tid 1597] NVLink Autonomous Link Initialization (ALI) feature is enabled. -[Dec 20 2024 00:36:35] [INFO] [tid 1597] start NVSwitch 0/0 routing configuration -[Dec 20 2024 00:36:35] [INFO] [tid 1597] completed NVSwitch 0/0 routing configuration -[Dec 20 2024 00:36:35] [INFO] [tid 1597] start NVSwitch 0/1 routing configuration -[Dec 20 2024 00:36:35] [INFO] [tid 1597] completed NVSwitch 0/1 routing configuration -[Dec 20 2024 00:36:35] [INFO] [tid 1597] start NVSwitch 0/2 routing configuration -[Dec 20 2024 00:36:35] [INFO] [tid 1597] completed NVSwitch 0/2 routing configuration -[Dec 20 2024 00:36:35] [INFO] [tid 1597] start NVSwitch 0/3 routing configuration -[Dec 20 2024 00:36:35] [INFO] [tid 1597] completed NVSwitch 0/3 routing configuration -[Dec 20 2024 00:36:35] [INFO] [tid 1597] Successfully configured all the available NVSwitches to route GPU NVLink traffic. NVLink Peer-to-Peer support will be enabled once the GPUs are successfully registered with the NVLink fabric. -[Dec 20 2024 00:36:36] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:1812bb68f4759378 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:3b00 Module Id:3 Uuid:GPU-47daa2ad-a218-6286-252b-af45f0dfaf00 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 20 2024 00:36:36] [INFO] [tid 1822] added GPU with UUID GPU-47daa2ad-a218-6286-252b-af45f0dfaf00 based on NVLink Inband GPU probe request. 
-[Dec 20 2024 00:36:36] [INFO] [tid 1822] fid: 0 Physical ID: 3 Index: 15104 Handle: f444cef PCI Bus ID: 00000000:3B:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-47daa2ad-a218-6286-252b-af45f0dfaf00 - -[Dec 20 2024 00:36:36] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:1812bb68f4759378 status:0 type:1 length:66 -Message payload details: Probe response: Handle:f5620ab00f444cef GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8230000000000 GPA Address Range:8000000000 FLA Address:230000000000 FLA Address Range:8000000000 - -[Dec 20 2024 00:36:37] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:1812bb6b957178c0 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:bb00 Module Id:4 Uuid:GPU-980ea11b-91dc-db7b-72ac-0685abc43658 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 20 2024 00:36:37] [INFO] [tid 1822] added GPU with UUID GPU-980ea11b-91dc-db7b-72ac-0685abc43658 based on NVLink Inband GPU probe request. -[Dec 20 2024 00:36:37] [INFO] [tid 1822] fid: 0 Physical ID: 4 Index: 47872 Handle: 4f9bcdea PCI Bus ID: 00000000:BB:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-980ea11b-91dc-db7b-72ac-0685abc43658 - -[Dec 20 2024 00:36:37] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:1812bb6b957178c0 status:0 type:1 length:66 -Message payload details: Probe response: Handle:9f8386104f9bcdea GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8240000000000 GPA Address Range:8000000000 FLA Address:240000000000 FLA Address Range:8000000000 - -[Dec 20 2024 00:36:39] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:1812bb699e6ad2d0 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:4c00 Module Id:0 Uuid:GPU-167d8376-ee13-fca4-938a-a05167c43cdc Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 20 2024 00:36:39] [INFO] [tid 1822] added GPU with UUID GPU-167d8376-ee13-fca4-938a-a05167c43cdc based on NVLink Inband GPU probe request. -[Dec 20 2024 00:36:39] [INFO] [tid 1822] fid: 0 Physical ID: 0 Index: 19456 Handle: 1edcff95 PCI Bus ID: 00000000:4C:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-167d8376-ee13-fca4-938a-a05167c43cdc - -[Dec 20 2024 00:36:39] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:1812bb699e6ad2d0 status:0 type:1 length:66 -Message payload details: Probe response: Handle:f624355f1edcff95 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8200000000000 GPA Address Range:8000000000 FLA Address:200000000000 FLA Address Range:8000000000 - -[Dec 20 2024 00:36:40] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:1812bb6c3c1803e0 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:cb00 Module Id:5 Uuid:GPU-4ef109e9-e925-4834-2b85-0771b4ef696c Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 20 2024 00:36:40] [INFO] [tid 1822] added GPU with UUID GPU-4ef109e9-e925-4834-2b85-0771b4ef696c based on NVLink Inband GPU probe request. 
-[Dec 20 2024 00:36:40] [INFO] [tid 1822] fid: 0 Physical ID: 5 Index: 51968 Handle: cc81c33b PCI Bus ID: 00000000:CB:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-4ef109e9-e925-4834-2b85-0771b4ef696c - -[Dec 20 2024 00:36:40] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:1812bb6c3c1803e0 status:0 type:1 length:66 -Message payload details: Probe response: Handle:b26daeafcc81c33b GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8250000000000 GPA Address Range:8000000000 FLA Address:250000000000 FLA Address Range:8000000000 - -[Dec 20 2024 00:36:42] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:1812bb6a45d84e40 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:5d00 Module Id:2 Uuid:GPU-afe9f2ed-9dd0-572c-3cd8-b42b58b3bc99 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 20 2024 00:36:42] [INFO] [tid 1822] added GPU with UUID GPU-afe9f2ed-9dd0-572c-3cd8-b42b58b3bc99 based on NVLink Inband GPU probe request. -[Dec 20 2024 00:36:42] [INFO] [tid 1822] fid: 0 Physical ID: 2 Index: 23808 Handle: a64309d8 PCI Bus ID: 00000000:5D:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-afe9f2ed-9dd0-572c-3cd8-b42b58b3bc99 - -[Dec 20 2024 00:36:42] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:1812bb6a45d84e40 status:0 type:1 length:66 -Message payload details: Probe response: Handle:d2daee64a64309d8 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8220000000000 GPA Address Range:8000000000 FLA Address:220000000000 FLA Address Range:8000000000 - -[Dec 20 2024 00:36:43] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:1812bb6ce39d9c60 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:db00 Module Id:7 Uuid:GPU-b0fe4996-1f7c-91f6-0ccd-08ae3b7e23c5 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 20 2024 00:36:43] [INFO] [tid 1822] added GPU with UUID GPU-b0fe4996-1f7c-91f6-0ccd-08ae3b7e23c5 based on NVLink Inband GPU probe request. -[Dec 20 2024 00:36:43] [INFO] [tid 1822] fid: 0 Physical ID: 7 Index: 56064 Handle: 9026da78 PCI Bus ID: 00000000:DB:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-b0fe4996-1f7c-91f6-0ccd-08ae3b7e23c5 - -[Dec 20 2024 00:36:43] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:1812bb6ce39d9c60 status:0 type:1 length:66 -Message payload details: Probe response: Handle:a354b7d09026da78 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8270000000000 GPA Address Range:8000000000 FLA Address:270000000000 FLA Address Range:8000000000 - -[Dec 20 2024 00:36:44] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:1812bb68497f1210 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:1900 Module Id:1 Uuid:GPU-ee7954ec-da95-fd7a-ddfd-c6b32cc2f8aa Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 20 2024 00:36:44] [INFO] [tid 1822] added GPU with UUID GPU-ee7954ec-da95-fd7a-ddfd-c6b32cc2f8aa based on NVLink Inband GPU probe request. 
-[Dec 20 2024 00:36:44] [INFO] [tid 1822] fid: 0 Physical ID: 1 Index: 6400 Handle: 8c0f88a5 PCI Bus ID: 00000000:19:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-ee7954ec-da95-fd7a-ddfd-c6b32cc2f8aa - -[Dec 20 2024 00:36:44] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:1812bb68497f1210 status:0 type:1 length:66 -Message payload details: Probe response: Handle:b0f9ec7a8c0f88a5 GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8210000000000 GPA Address Range:8000000000 FLA Address:210000000000 FLA Address Range:8000000000 - -[Dec 20 2024 00:36:45] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:1812bb6aef48ae90 status:0 type:0 length:4d -Message payload details: Probe request: Pci Info:9b00 Module Id:6 Uuid:GPU-2b6f61f9-8b00-3b57-d912-b62792120662 Discovered LinkMask:3ffff Enabled LinkMask:3ffff Cap Mask:0 - -[Dec 20 2024 00:36:45] [INFO] [tid 1822] added GPU with UUID GPU-2b6f61f9-8b00-3b57-d912-b62792120662 based on NVLink Inband GPU probe request. -[Dec 20 2024 00:36:45] [INFO] [tid 1822] fid: 0 Physical ID: 6 Index: 39680 Handle: 97e42f7b PCI Bus ID: 00000000:9B:00.0 Discovered Link Mask: 3ffff Enabled Link Mask: 3ffff UUID: GPU-2b6f61f9-8b00-3b57-d912-b62792120662 - -[Dec 20 2024 00:36:45] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:1812bb6aef48ae90 status:0 type:1 length:66 -Message payload details: Probe response: Handle:6df2dee697e42f7b GfId:0 FM Caps:7 Fabric Partition Id:ffff GPA Address:8260000000000 GPA Address Range:8000000000 FLA Address:260000000000 FLA Address Range:8000000000 - -[Dec 20 2024 03:15:35] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:f644e674334e8af8 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:b0f9ec7a8c0f88a5 f5620ab00f444cef f624355f1edcff95 d2daee64a64309d8 b26daeafcc81c33b 9f8386104f9bcdea a354b7d09026da78 6df2dee697e42f7b - -[Dec 20 2024 03:15:35] [INFO] [tid 1822] multicast group 0 is allocated. -[Dec 20 2024 03:15:35] [INFO] [tid 1822] successfully setup multicast team for request id 17745561818497977080. -[Dec 20 2024 03:15:36] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:f644e674334e8af8 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:6ddfca266c13893c Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 20 2024 03:21:54] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:abb2d0ea13167103 status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:b0f9ec7a8c0f88a5 f5620ab00f444cef f624355f1edcff95 d2daee64a64309d8 a354b7d09026da78 6df2dee697e42f7b 9f8386104f9bcdea b26daeafcc81c33b - -[Dec 20 2024 03:21:54] [INFO] [tid 1822] multicast group 1 is allocated. -[Dec 20 2024 03:21:54] [INFO] [tid 1822] successfully setup multicast team for request id 12372180830101336323. 
-[Dec 20 2024 03:21:54] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:abb2d0ea13167103 status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:238c52d65d48b8a9 Flags:0 Address Base:c008000000000 Address Size:60000000 - -[Dec 20 2024 03:23:07] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:f25ed53748cba3f3 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:238c52d65d48b8a9 Flags:0 - -[Dec 20 2024 03:23:07] [INFO] [tid 1822] multicast group 1 is freed. -[Dec 20 2024 03:23:07] [INFO] [tid 1822] successfully released multicast team with handle 2561513368708495529. -[Dec 20 2024 03:23:07] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:ac88d1d464e81558 status:0 type:4 length:14 -Message payload details:Team release request: Team Handle:6ddfca266c13893c Flags:0 - -[Dec 20 2024 03:23:07] [INFO] [tid 1822] multicast group 0 is freed. -[Dec 20 2024 03:23:07] [INFO] [tid 1822] successfully released multicast team with handle 7917268936311408956. -[Dec 20 2024 03:23:32] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:6989e23eaf02591f status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:b0f9ec7a8c0f88a5 f5620ab00f444cef f624355f1edcff95 d2daee64a64309d8 a354b7d09026da78 9f8386104f9bcdea b26daeafcc81c33b 6df2dee697e42f7b - -[Dec 20 2024 03:23:32] [INFO] [tid 1822] multicast group 0 is allocated. -[Dec 20 2024 03:23:32] [INFO] [tid 1822] successfully setup multicast team for request id 7604858204643809567. -[Dec 20 2024 03:23:32] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:6989e23eaf02591f status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:c622f03e987d5484 Flags:0 Address Base:c000000000000 Address Size:60000000 - -[Dec 20 2024 03:27:28] [INFO] [tid 1817] Received an inband message: Message header details: magic Id:adbc request Id:5ab42d347932625e status:0 type:2 length:56 -Message payload details:Team setup request: Allocation Size:60000000 Flags:0 Number of GPUs:8 GPU Handles:b0f9ec7a8c0f88a5 f5620ab00f444cef f624355f1edcff95 d2daee64a64309d8 9f8386104f9bcdea b26daeafcc81c33b 6df2dee697e42f7b a354b7d09026da78 - -[Dec 20 2024 03:27:28] [INFO] [tid 1822] multicast group 1 is allocated. -[Dec 20 2024 03:27:28] [INFO] [tid 1822] successfully setup multicast team for request id 6535898662616326750. 
-[Dec 20 2024 03:27:28] [INFO] [tid 1603] Sending inband response message: Message header details: magic Id:adbc request Id:5ab42d347932625e status:0 type:3 length:24 -Message payload details:Team setup response: Team Handle:ef360e92f05ab615 Flags:0 Address Base:c008000000000 Address Size:60000000 \ No newline at end of file diff --git a/pkg/query/log/tail/testdata/kubelet.0.log b/pkg/query/log/tail/testdata/kubelet.0.log deleted file mode 100644 index 7c7e7e38..00000000 --- a/pkg/query/log/tail/testdata/kubelet.0.log +++ /dev/null @@ -1,20 +0,0 @@ -W0326 15:43:36.380682 12398 reflector.go:458] vendor/k8s.io/client-go/informers/factory.go:150: watch of *v1.RuntimeClass ended with: an error on the server ("unable to decode an event from the watch stream: http2: client connection lost") has prevented the request from succeeding -W0326 15:43:36.380691 12398 reflector.go:458] object-"calico-system"/"node-certs": watch of *v1.Secret ended with: an error on the server ("unable to decode an event from the watch stream: http2: client connection lost") has prevented the request from succeeding -W0326 15:43:36.380694 12398 reflector.go:458] object-"vector"/"vector": watch of *v1.ConfigMap ended with: an error on the server ("unable to decode an event from the watch stream: http2: client connection lost") has prevented the request from succeeding -W0326 15:43:36.380721 12398 reflector.go:458] object-"calico-system"/"kube-root-ca.crt": watch of *v1.ConfigMap ended with: an error on the server ("unable to decode an event from the watch stream: http2: client connection lost") has prevented the request from succeeding -E0326 15:43:36.383485 12398 kubelet_node_status.go:540] "Error updating node status, will retry" err="error getting node \"fargate-ip-10-0-82-17.us-west-2.compute.internal\": Get \"https://39F24B13E46CFBFC72E3786FBFC105C0.gr7.us-west-2.eks.amazonaws.com/api/v1/nodes/fargate-ip-10-0-82-17.us-west-2.compute.internal?timeout=10s\": http2: client connection lost" -E0326 15:43:36.380684 12398 event.go:289] Unable to write event: '&v1.Event{TypeMeta:v1.TypeMeta{Kind:"", APIVersion:""}, ObjectMeta:v1.ObjectMeta{Name:"calico-node-satellite-7w2m9.17c05ae66aa89100", GenerateName:"", Namespace:"calico-system", SelfLink:"", UID:"", ResourceVersion:"", Generation:0, CreationTimestamp:time.Date(1, time.January, 1, 0, 0, 0, 0, time.UTC), DeletionTimestamp:, DeletionGracePeriodSeconds:(*int64)(nil), Labels:map[string]string(nil), Annotations:map[string]string(nil), OwnerReferences:[]v1.OwnerReference(nil), Finalizers:[]string(nil), ManagedFields:[]v1.ManagedFieldsEntry(nil)}, InvolvedObject:v1.ObjectReference{Kind:"Pod", Namespace:"calico-system", Name:"calico-node-satellite-7w2m9", UID:"31a07e7f-fa9c-44f5-8102-a6f9852a3cec", APIVersion:"v1", ResourceVersion:"384104230", FieldPath:"spec.containers{calico-node}"}, Reason:"Unhealthy", Message:"Readiness probe failed: calico/node is not ready: felix is not ready: readiness probe reporting 503\nW0326 15:43:24.069636 204878 feature_gate.go:241] Setting GA feature gate ServiceInternalTrafficPolicy=true. 
It will be removed in a future release.\n", Source:v1.EventSource{Component:"kubelet", Host:"fargate-ip-10-0-82-17.us-west-2.compute.internal"}, FirstTimestamp:time.Date(2024, time.March, 26, 15, 43, 24, 79198464, time.Local), LastTimestamp:time.Date(2024, time.March, 26, 15, 43, 24, 79198464, time.Local), Count:1, Type:"Warning", EventTime:time.Date(1, time.January, 1, 0, 0, 0, 0, time.UTC), Series:(*v1.EventSeries)(nil), Action:"", Related:(*v1.ObjectReference)(nil), ReportingController:"kubelet", ReportingInstance:"fargate-ip-10-0-82-17.us-west-2.compute.internal"}': 'Post "https://39F24B13E46CFBFC72E3786FBFC105C0.gr7.us-west-2.eks.amazonaws.com/api/v1/namespaces/calico-system/events": http2: client connection lost'(may retry after sleeping) -E0326 15:43:36.673499 12398 controller.go:193] "Failed to update lease" err="Operation cannot be fulfilled on leases.coordination.k8s.io \"fargate-ip-10-0-82-17.us-west-2.compute.internal\": the object has been modified; please apply your changes to the latest version and try again" -I0326 15:43:36.764785 12398 controller.go:116] "failed to update lease using latest lease, fallback to ensure lease" err="failed 5 attempts to update lease" -E0326 15:49:55.526004 12398 server.go:310] "Unable to authenticate the request due to an error" err="context canceled" -E0326 15:50:03.989811 12398 remote_runtime.go:496] "ExecSync cmd from runtime service failed" err="rpc error: code = DeadlineExceeded desc = context deadline exceeded" containerID="6070a04505cf5eb7b64c57a055c41b77e08d482c9a41e24de029149729cacb59" cmd=["/bin/calico-node","-felix-live"] -I0326 15:50:03.989963 12398 prober.go:107] "Probe failed" probeType="Liveness" pod="calico-system/calico-node-satellite-7w2m9" podUID="31a07e7f-fa9c-44f5-8102-a6f9852a3cec" containerName="calico-node" probeResult="failure" output="command \"/bin/calico-node -felix-live\" timed out" -W0326 15:53:37.321150 12398 transport.go:301] Unable to cancel request for *otelhttp.Transport -E0326 15:53:37.321360 12398 controller.go:193] "Failed to update lease" err="Put \"https://39F24B13E46CFBFC72E3786FBFC105C0.gr7.us-west-2.eks.amazonaws.com/apis/coordination.k8s.io/v1/namespaces/kube-node-lease/leases/fargate-ip-10-0-82-17.us-west-2.compute.internal?timeout=10s\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)" -W0326 15:53:38.255488 12398 transport.go:301] Unable to cancel request for *otelhttp.Transport -E0326 15:53:38.255692 12398 kubelet_node_status.go:540] "Error updating node status, will retry" err="error getting node \"fargate-ip-10-0-82-17.us-west-2.compute.internal\": Get \"https://39F24B13E46CFBFC72E3786FBFC105C0.gr7.us-west-2.eks.amazonaws.com/api/v1/nodes/fargate-ip-10-0-82-17.us-west-2.compute.internal?resourceVersion=0&timeout=10s\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)" -W0326 15:53:47.321635 12398 transport.go:301] Unable to cancel request for *otelhttp.Transport -E0326 15:53:47.321892 12398 controller.go:193] "Failed to update lease" err="Put \"https://39F24B13E46CFBFC72E3786FBFC105C0.gr7.us-west-2.eks.amazonaws.com/apis/coordination.k8s.io/v1/namespaces/kube-node-lease/leases/fargate-ip-10-0-82-17.us-west-2.compute.internal?timeout=10s\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)" -W0326 15:53:48.256535 12398 transport.go:301] Unable to cancel request for *otelhttp.Transport -E0326 15:53:48.256802 12398 kubelet_node_status.go:540] "Error updating node status, will retry" err="error getting node 
\"fargate-ip-10-0-82-17.us-west-2.compute.internal\": Get \"https://39F24B13E46CFBFC72E3786FBFC105C0.gr7.us-west-2.eks.amazonaws.com/api/v1/nodes/fargate-ip-10-0-82-17.us-west-2.compute.internal?timeout=10s\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)" -E0326 15:53:48.606374 12398 controller.go:193] "Failed to update lease" err="Operation cannot be fulfilled on leases.coordination.k8s.io \"fargate-ip-10-0-82-17.us-west-2.compute.internal\": the object has been modified; please apply your changes to the latest version and try again" \ No newline at end of file diff --git a/pkg/server/server.go b/pkg/server/server.go index 8951c6e4..33fff6de 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -37,9 +37,9 @@ import ( nvidia_ecc "github.com/leptonai/gpud/components/accelerator/nvidia/ecc" nvidia_ecc_id "github.com/leptonai/gpud/components/accelerator/nvidia/ecc/id" nvidia_error "github.com/leptonai/gpud/components/accelerator/nvidia/error" - nvidia_error_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid" + nvidia_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid" nvidia_component_error_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid/id" - nvidia_error_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid" + nvidia_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid" nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id" nvidia_fabric_manager "github.com/leptonai/gpud/components/accelerator/nvidia/fabric-manager" nvidia_fabric_manager_id "github.com/leptonai/gpud/components/accelerator/nvidia/fabric-manager/id" @@ -113,7 +113,6 @@ import ( nvidia_query "github.com/leptonai/gpud/pkg/nvidia-query" nvidia_query_nvml "github.com/leptonai/gpud/pkg/nvidia-query/nvml" query_config "github.com/leptonai/gpud/pkg/query/config" - query_log_state "github.com/leptonai/gpud/pkg/query/log/state" "github.com/leptonai/gpud/pkg/session" "github.com/leptonai/gpud/pkg/sqlite" ) @@ -229,10 +228,6 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID return nil, fmt.Errorf("api version mismatch: %s (only supports v1)", ver) } - if err := query_log_state.CreateTableLogFileSeekInfo(ctx, dbRW); err != nil { - return nil, fmt.Errorf("failed to create query log state table: %w", err) - } - if err := components_metrics_state.CreateTableMetrics(ctx, dbRW, components_metrics_state.DefaultTableName); err != nil { return nil, fmt.Errorf("failed to create metrics table: %w", err) } @@ -519,11 +514,11 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID allComponents = append(allComponents, c) case nvidia_component_error_xid_id.Name: - allComponents = append(allComponents, nvidia_error_xid.New(ctx, dbRW, dbRO)) + allComponents = append(allComponents, nvidia_xid.New(ctx, dbRW, dbRO)) case nvidia_component_error_sxid_id.Name: // db object to read sxid events (read-only, writes are done in poller) - allComponents = append(allComponents, nvidia_error_sxid.New(ctx, dbRW, dbRO)) + allComponents = append(allComponents, nvidia_sxid.New(ctx, dbRW, dbRO)) case nvidia_hw_slowdown_id.Name: cfg := nvidia_common.Config{Query: defaultQueryCfg, ToolOverwrites: options.ToolOverwrites}