Skip to content

Commit

Permalink
feat(query/log): remove, and simplify "scan" dmesg func for one-off operations, remove "gpud logs" (#487)
Browse files Browse the repository at this point in the history
  • Loading branch information
gyuho authored Mar 4, 2025
1 parent 6adcdad commit 6064ecb
Show file tree
Hide file tree
Showing 35 changed files with 70 additions and 4,311 deletions.
21 changes: 0 additions & 21 deletions cmd/gpud/command/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -473,27 +473,6 @@ sudo rm /etc/systemd/system/gpud.service
},
},
},
{
Name: "logs",
Aliases: []string{"log", "l"},

Usage: "checks the gpud logs",
Action: cmdLogs,
Flags: []cli.Flag{
&cli.IntFlag{
Name: "lines,n",
Usage: "set the number to tail logs",
Destination: &tailLines,
Value: 100,
},
&cli.StringFlag{
Name: "log-file",
Usage: "set the log file path (set empty to stdout/stderr)",
Destination: &logFile,
Value: "",
},
},
},

{
Name: "is-nvidia",
Expand Down
44 changes: 0 additions & 44 deletions cmd/gpud/command/logs.go

This file was deleted.

2 changes: 1 addition & 1 deletion cmd/gpud/command/up.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func cmdUp(cliContext *cli.Context) (retErr error) {
return err
}

fmt.Printf("%s successfully started gpud (run 'gpud status' or 'gpud logs' for checking status)\n", checkMark)
fmt.Printf("%s successfully started gpud (run 'gpud status' for checking status)\n", checkMark)
return nil
}

Expand Down
4 changes: 1 addition & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ require (
github.com/hdevalence/ed25519consensus v0.2.0
github.com/mattn/go-sqlite3 v1.14.25-0.20241209043634-7658c06970ec
github.com/mitchellh/go-homedir v1.1.0
github.com/nxadm/tail v1.4.11
github.com/olekukonko/tablewriter v0.0.5
github.com/onsi/ginkgo/v2 v2.21.0
github.com/onsi/gomega v1.35.1
Expand All @@ -43,7 +42,6 @@ require (
k8s.io/apimachinery v0.32.0
k8s.io/client-go v0.29.1
k8s.io/cri-api v0.32.0
k8s.io/utils v0.0.0-20241210054802-24370beab758
sigs.k8s.io/yaml v1.4.0
tailscale.com v1.68.2
)
Expand Down Expand Up @@ -164,10 +162,10 @@ require (
google.golang.org/genproto/googleapis/rpc v0.0.0-20250106144421-5f5ef82da422 // indirect
google.golang.org/protobuf v1.36.1 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
gotest.tools/v3 v3.5.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect
nhooyr.io/websocket v1.8.10 // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect
Expand Down
6 changes: 0 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
Expand Down Expand Up @@ -241,8 +240,6 @@ github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7P
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY=
github.com/nxadm/tail v1.4.11/go.mod h1:OTaG3NK980DZzxbRq6lEuzgU+mug70nY11sMd4JXXHc=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM=
Expand Down Expand Up @@ -407,7 +404,6 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220817070843-5a390386f1f2/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.4.1-0.20230131160137-e7d7f63158de/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
Expand Down Expand Up @@ -458,8 +454,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc=
gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
Expand Down
38 changes: 3 additions & 35 deletions pkg/diagnose/diagnose.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,11 @@ import (

"sigs.k8s.io/yaml"

nvidia_component_error_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid"
nvidia_component_error_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid"
pkg_dmesg "github.com/leptonai/gpud/pkg/dmesg"
"github.com/leptonai/gpud/pkg/host"
"github.com/leptonai/gpud/pkg/log"
nvidia_query "github.com/leptonai/gpud/pkg/nvidia-query"
"github.com/leptonai/gpud/pkg/process"
query_log_common "github.com/leptonai/gpud/pkg/query/log/common"
query_log_tail "github.com/leptonai/gpud/pkg/query/log/tail"
pkd_systemd "github.com/leptonai/gpud/pkg/systemd"
)

Expand Down Expand Up @@ -144,44 +140,16 @@ func run(ctx context.Context, dir string, opts ...OpOption) error {
nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx)
if nvidiaInstalled && err == nil {
fmt.Printf("%s scanning dmesg with regexes\n", inProgress)
matched, err := query_log_tail.Scan(
ctx,
query_log_tail.WithDedup(true),
query_log_tail.WithCommands(pkg_dmesg.DefaultDmesgScanCommands),
query_log_tail.WithLinesToTail(5000),
query_log_tail.WithMatchFunc(
func(line string) (string, string) {
xidErr := nvidia_component_error_xid.Match(line)
if xidErr != nil {
return "xid found", ""
}
return "", "" // no match
},
func(line string) (string, string) {
sxidErr := nvidia_component_error_sxid.Match(line)
if sxidErr != nil {
return "sxid found", ""
}
return "", "" // no match
},
),
query_log_tail.WithExtractTime(func(l []byte) (time.Time, []byte, error) {
dm := pkg_dmesg.ParseDmesgLine(string(l))
return dm.Timestamp, l, nil
}),
query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) {
o.CheckSummary = append(o.CheckSummary, fmt.Sprintf("dmesg match: %s", string(line)))
}),
)
issueCnt, err := scanDmesg(ctx)
if err != nil {
o.Results = append(o.Results, CommandResult{
Command: strings.Join(pkg_dmesg.DefaultDmesgScanCommands[0], " "),
Error: err.Error(),
})
} else if matched == 0 {
} else if issueCnt == 0 {
o.CheckSummary = append(o.CheckSummary, "dmesg scan passed")
} else {
o.CheckSummary = append(o.CheckSummary, fmt.Sprintf("dmesg scan detected %d issues", matched))
o.CheckSummary = append(o.CheckSummary, fmt.Sprintf("dmesg scan detected %d issues", issueCnt))
}
}

Expand Down
106 changes: 61 additions & 45 deletions pkg/diagnose/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ import (
"runtime"
"time"

nvidia_component_error_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid"
nvidia_component_error_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid"
nvidia_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid"
nvidia_xid "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid"
nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id"
nvidia_hw_slowdown_id "github.com/leptonai/gpud/components/accelerator/nvidia/hw-slowdown/id"
"github.com/leptonai/gpud/pkg/disk"
Expand All @@ -24,8 +24,6 @@ import (
"github.com/leptonai/gpud/pkg/nvidia-query/infiniband"
nvidia_query_nvml "github.com/leptonai/gpud/pkg/nvidia-query/nvml"
"github.com/leptonai/gpud/pkg/process"
query_log_common "github.com/leptonai/gpud/pkg/query/log/common"
query_log_tail "github.com/leptonai/gpud/pkg/query/log/tail"
"github.com/leptonai/gpud/pkg/sqlite"

"github.com/dustin/go-humanize"
Expand Down Expand Up @@ -227,52 +225,14 @@ func Scan(ctx context.Context, opts ...OpOption) error {
}

fmt.Printf("%s scanning dmesg for %d lines\n", inProgress, op.lines)
matched, err := query_log_tail.Scan(
ctx,
query_log_tail.WithDedup(true),
query_log_tail.WithCommands(pkg_dmesg.DefaultDmesgScanCommands),
query_log_tail.WithLinesToTail(op.lines),
query_log_tail.WithMatchFunc(
func(line string) (string, string) {
xidErr := nvidia_component_error_xid.Match(line)
if xidErr != nil {
return "xid found", ""
}
return "", "" // no match
},
func(line string) (string, string) {
sxidErr := nvidia_component_error_sxid.Match(line)
if sxidErr != nil {
return "sxid found", ""
}
return "", "" // no match
},
),
query_log_tail.WithExtractTime(func(l []byte) (time.Time, []byte, error) {
dm := pkg_dmesg.ParseDmesgLine(string(l))
return dm.Timestamp, l, nil
}),
query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) {
if xidErr := nvidia_component_error_xid.Match(string(line)); xidErr != nil {
log.Logger.Warnw("known xid", "line", string(line))
yb, _ := xidErr.YAML()
fmt.Println(string(yb))
}

if sxidErr := nvidia_component_error_sxid.Match(string(line)); sxidErr != nil {
log.Logger.Warnw("known sxid", "line", string(line))
yb, _ := sxidErr.YAML()
fmt.Println(string(yb))
}
}),
)
issueCnt, err := scanDmesg(ctx)
if err != nil {
return err
}
if matched == 0 {
if issueCnt == 0 {
fmt.Printf("%s scanned dmesg file -- found no issue\n", checkMark)
} else {
fmt.Printf("%s scanned dmesg file -- found %d issue(s)\n", warningSign, matched)
fmt.Printf("%s scanned dmesg file -- found %d issue(s)\n", warningSign, issueCnt)
}
}

Expand Down Expand Up @@ -324,3 +284,59 @@ func Scan(ctx context.Context, opts ...OpOption) error {
fmt.Printf("\n\n%s scan complete\n\n", checkMark)
return nil
}

// scanDmesg runs pkg_dmesg.DefaultDmesgScanCommands once as a bash script,
// reads the command's stdout and stderr line by line, and prints every line
// for which nvidia_xid.Match or nvidia_sxid.Match returns a non-nil result.
//
// Each line is counted at most once: the XID matcher is tried first, and the
// SXID matcher is only consulted when the XID matcher did not match.
//
// It returns the number of matching lines. A non-nil error (with a zero
// count) is returned when the process cannot be created or started, when
// reading its output fails, when ctx is done before the process exit is
// observed, or when waiting on the process reports an error.
func scanDmesg(ctx context.Context) (int, error) {
	proc, err := process.New(
		process.WithCommands(pkg_dmesg.DefaultDmesgScanCommands),
		process.WithRunAsBashScript(),
	)
	if err != nil {
		return 0, err
	}
	if err = proc.Start(ctx); err != nil {
		return 0, err
	}
	defer func() {
		if closeErr := proc.Close(ctx); closeErr != nil {
			log.Logger.Warnw("failed to abort command", "err", closeErr)
		}
	}()

	// Reference time for the relative age printed next to each match;
	// captured once so that every line is measured against the same baseline.
	startedAt := time.Now().UTC()
	matches := 0

	// onLine inspects a single output line and reports it when it matches.
	onLine := func(line string) {
		entry := pkg_dmesg.ParseDmesgLine(line)
		age := humanize.RelTime(entry.Timestamp, startedAt, "ago", "from now")

		switch {
		case nvidia_xid.Match(line) != nil:
			fmt.Printf("[XID found] (%s) %q\n", age, entry.Content)
		case nvidia_sxid.Match(line) != nil:
			fmt.Printf("[SXID found] (%s) %q\n", age, entry.Content)
		default:
			// Neither matcher recognized the line; nothing to report.
			return
		}
		matches++
	}

	readErr := process.Read(
		ctx,
		proc,
		process.WithReadStdout(),
		process.WithReadStderr(),
		process.WithProcessLine(onLine),
	)
	if readErr != nil {
		return 0, readErr
	}

	// Output has been consumed; now observe the process exit, unless the
	// context is done first.
	select {
	case <-ctx.Done():
		return 0, ctx.Err()
	case waitErr := <-proc.Wait():
		if waitErr != nil {
			return 0, waitErr
		}
		return matches, nil
	}
}
Loading

0 comments on commit 6064ecb

Please sign in to comment.