Skip to content

Commit

Permalink
fix(diagnose): fix "gpud scan" (add missing temp db creation) (#314)
Browse files Browse the repository at this point in the history
Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Jan 21, 2025
1 parent a53a3db commit 8af22e8
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 45 deletions.
6 changes: 6 additions & 0 deletions cmd/gpud/command/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ var (
pollGPMEvents bool
netcheck bool
diskcheck bool
dmesgCheck bool

enableAutoUpdate bool
autoUpdateExitCode int
Expand Down Expand Up @@ -536,6 +537,11 @@ cat summary.txt
Usage: "enable disk checks (default: true)",
Destination: &diskcheck,
},
&cli.BoolTFlag{
Name: "dmesg-check",
Usage: "enable dmesg checks (default: true)",
Destination: &dmesgCheck,
},
},
},
{
Expand Down
1 change: 1 addition & 0 deletions cmd/gpud/command/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ func cmdScan(cliContext *cli.Context) error {
diagnose.WithPollGPMEvents(pollGPMEvents),
diagnose.WithNetcheck(netcheck),
diagnose.WithDiskcheck(diskcheck),
diagnose.WithDmesgCheck(dmesgCheck),
}
if zapLvl.Level() <= zap.DebugLevel { // e.g., info, warn, error
diagnoseOpts = append(diagnoseOpts, diagnose.WithDebug(true))
Expand Down
8 changes: 8 additions & 0 deletions components/diagnose/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ type Op struct {

netcheck bool
diskcheck bool

dmesgCheck bool
}

type OpOption func(*Op)
Expand Down Expand Up @@ -66,3 +68,9 @@ func WithDiskcheck(b bool) OpOption {
op.diskcheck = b
}
}

// WithDmesgCheck returns an OpOption that sets the dmesgCheck field of Op to b.
// Scan only runs its dmesg scanning step (which requires root) when this
// field is true.
func WithDmesgCheck(b bool) OpOption {
	return func(o *Op) { o.dmesgCheck = b }
}
96 changes: 53 additions & 43 deletions components/diagnose/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/dustin/go-humanize"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state"
nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml"
nvidia_query_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid"
nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid"
Expand Down Expand Up @@ -41,10 +42,6 @@ var defaultNVIDIALibraries = []string{

// Runs the scan operations.
func Scan(ctx context.Context, opts ...OpOption) error {
if os.Geteuid() != 0 {
return errors.New("requires sudo/root access in order to scan dmesg errors")
}

op := &Op{}
if err := op.applyOpts(opts); err != nil {
return err
Expand Down Expand Up @@ -126,6 +123,13 @@ func Scan(ctx context.Context, opts ...OpOption) error {
}
defer db.Close()

// "nvidia_query.Get" assumes that the "clock-events-state" table exists
// pre-create since this is a one-off operation
// TODO: move these into a single place
if err := nvidia_clock_events_state.CreateTable(ctx, db); err != nil {
log.Logger.Fatalw("failed to create clock events state table", "error", err)
}

outputRaw, err := nvidia_query.Get(ctx, db, db)
if err != nil {
log.Logger.Warnw("error getting nvidia info", "error", err)
Expand Down Expand Up @@ -214,48 +218,54 @@ func Scan(ctx context.Context, opts ...OpOption) error {
}
println()

fmt.Printf("%s scanning dmesg for %d lines\n", inProgress, op.lines)
defaultDmesgCfg, err := dmesg.DefaultConfig(ctx)
if err != nil {
return err
}
matched, err := query_log_tail.Scan(
ctx,
query_log_tail.WithDedup(true),
query_log_tail.WithCommands(defaultDmesgCfg.Log.Scan.Commands),
query_log_tail.WithLinesToTail(op.lines),
query_log_tail.WithSelectFilter(defaultDmesgCfg.Log.SelectFilters...),
query_log_tail.WithExtractTime(defaultDmesgCfg.Log.TimeParseFunc),
query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) {
log.Logger.Debugw("matched", "line", string(line))
fmt.Println("line", string(line))
matchedB, _ := matched.YAML()
fmt.Println(string(matchedB))

if xid := nvidia_query_xid.ExtractNVRMXid(string(line)); xid > 0 {
if dm, err := nvidia_query_xid.ParseDmesgLogLine(metav1.Time{Time: time}, string(line)); err == nil {
log.Logger.Warnw("known xid", "line", string(line))
yb, _ := dm.YAML()
fmt.Println(string(yb))
if op.dmesgCheck {
if os.Geteuid() != 0 {
return errors.New("requires sudo/root access in order to scan dmesg errors")
}

fmt.Printf("%s scanning dmesg for %d lines\n", inProgress, op.lines)
defaultDmesgCfg, err := dmesg.DefaultConfig(ctx)
if err != nil {
return err
}
matched, err := query_log_tail.Scan(
ctx,
query_log_tail.WithDedup(true),
query_log_tail.WithCommands(defaultDmesgCfg.Log.Scan.Commands),
query_log_tail.WithLinesToTail(op.lines),
query_log_tail.WithSelectFilter(defaultDmesgCfg.Log.SelectFilters...),
query_log_tail.WithExtractTime(defaultDmesgCfg.Log.TimeParseFunc),
query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) {
log.Logger.Debugw("matched", "line", string(line))
fmt.Println("line", string(line))
matchedB, _ := matched.YAML()
fmt.Println(string(matchedB))

if xid := nvidia_query_xid.ExtractNVRMXid(string(line)); xid > 0 {
if dm, err := nvidia_query_xid.ParseDmesgLogLine(metav1.Time{Time: time}, string(line)); err == nil {
log.Logger.Warnw("known xid", "line", string(line))
yb, _ := dm.YAML()
fmt.Println(string(yb))
}
}
}

if sxid := nvidia_query_sxid.ExtractNVSwitchSXid(string(line)); sxid > 0 {
if dm, err := nvidia_query_sxid.ParseDmesgLogLine(metav1.Time{Time: time}, string(line)); err == nil {
log.Logger.Warnw("known sxid", "line", string(line))
yb, _ := dm.YAML()
fmt.Println(string(yb))
if sxid := nvidia_query_sxid.ExtractNVSwitchSXid(string(line)); sxid > 0 {
if dm, err := nvidia_query_sxid.ParseDmesgLogLine(metav1.Time{Time: time}, string(line)); err == nil {
log.Logger.Warnw("known sxid", "line", string(line))
yb, _ := dm.YAML()
fmt.Println(string(yb))
}
}
}
}),
)
if err != nil {
return err
}
if matched == 0 {
fmt.Printf("%s scanned dmesg file -- found no issue\n", checkMark)
} else {
fmt.Printf("%s scanned dmesg file -- found %d issue(s)\n", warningSign, matched)
}),
)
if err != nil {
return err
}
if matched == 0 {
fmt.Printf("%s scanned dmesg file -- found no issue\n", checkMark)
} else {
fmt.Printf("%s scanned dmesg file -- found %d issue(s)\n", warningSign, matched)
}
}

if op.netcheck {
Expand Down
16 changes: 16 additions & 0 deletions components/diagnose/scan_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package diagnose

import (
"context"
"testing"
"time"
)

// TestScan calls Scan with no options under a 20-second timeout and fails
// the test if Scan returns an error.
func TestScan(t *testing.T) {
	const scanTimeout = 20 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), scanTimeout)
	defer cancel()

	err := Scan(ctx)
	if err != nil {
		t.Fatalf("error scanning: %+v", err)
	}
}
14 changes: 12 additions & 2 deletions e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ func TestGpudHealthzInfo(t *testing.T) {
t.Skip("skipping e2e tests")
}

// start gpud scan
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
cmd := exec.CommandContext(ctx, os.Getenv("GPUD_BIN"), "scan", "--dmesg-check=false")
b, err := cmd.CombinedOutput()
cancel()
if err != nil {
t.Fatalf("failed to run gpud scan: %v\n%s", err, string(b))
}
t.Logf("gpud scan output:\n%s", string(b))

// get an available port
listener, err := net.Listen("tcp", "localhost:0")
if err != nil {
Expand All @@ -42,10 +52,10 @@ func TestGpudHealthzInfo(t *testing.T) {
randVal := randStr(t, 10)

// start gpud command
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
ctx, cancel = context.WithTimeout(context.Background(), time.Minute)
defer cancel()

cmd := exec.CommandContext(ctx, os.Getenv("GPUD_BIN"), "run", "--log-level=debug", "--web-enable=false", "--enable-auto-update=false", "--annotations", fmt.Sprintf("{%q:%q}", randKey, randVal), fmt.Sprintf("--listen-address=%s", ep))
cmd = exec.CommandContext(ctx, os.Getenv("GPUD_BIN"), "run", "--log-level=debug", "--web-enable=false", "--enable-auto-update=false", "--annotations", fmt.Sprintf("{%q:%q}", randKey, randVal), fmt.Sprintf("--listen-address=%s", ep))
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr

Expand Down

0 comments on commit 8af22e8

Please sign in to comment.