From 8af22e864ef1d8da69a5bcb40626b4f48c0bbf9e Mon Sep 17 00:00:00 2001 From: Gyuho Lee <6799218+gyuho@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:25:31 +0800 Subject: [PATCH] fix(diagnose): fix "gpud scan" (add missing temp db creation) (#314) Signed-off-by: Gyuho Lee --- cmd/gpud/command/command.go | 6 ++ cmd/gpud/command/scan.go | 1 + components/diagnose/options.go | 8 +++ components/diagnose/scan.go | 96 ++++++++++++++++++-------------- components/diagnose/scan_test.go | 16 ++++++ e2e/e2e_test.go | 14 ++++- 6 files changed, 96 insertions(+), 45 deletions(-) create mode 100644 components/diagnose/scan_test.go diff --git a/cmd/gpud/command/command.go b/cmd/gpud/command/command.go index a493e06d..9ef937e4 100644 --- a/cmd/gpud/command/command.go +++ b/cmd/gpud/command/command.go @@ -42,6 +42,7 @@ var ( pollGPMEvents bool netcheck bool diskcheck bool + dmesgCheck bool enableAutoUpdate bool autoUpdateExitCode int @@ -536,6 +537,11 @@ cat summary.txt Usage: "enable disk checks (default: true)", Destination: &diskcheck, }, + &cli.BoolTFlag{ + Name: "dmesg-check", + Usage: "enable dmesg checks (default: true)", + Destination: &dmesgCheck, + }, }, }, { diff --git a/cmd/gpud/command/scan.go b/cmd/gpud/command/scan.go index 94b09d06..79ae598c 100644 --- a/cmd/gpud/command/scan.go +++ b/cmd/gpud/command/scan.go @@ -30,6 +30,7 @@ func cmdScan(cliContext *cli.Context) error { diagnose.WithPollGPMEvents(pollGPMEvents), diagnose.WithNetcheck(netcheck), diagnose.WithDiskcheck(diskcheck), + diagnose.WithDmesgCheck(dmesgCheck), } if zapLvl.Level() <= zap.DebugLevel { // e.g., info, warn, error diagnoseOpts = append(diagnoseOpts, diagnose.WithDebug(true)) diff --git a/components/diagnose/options.go b/components/diagnose/options.go index 69326bf3..d365d9e5 100644 --- a/components/diagnose/options.go +++ b/components/diagnose/options.go @@ -10,6 +10,8 @@ type Op struct { netcheck bool diskcheck bool + + dmesgCheck bool } type OpOption func(*Op) @@ -66,3 +68,9 @@ func WithDiskcheck(b bool) OpOption { op.diskcheck = b } } + +func WithDmesgCheck(b bool) OpOption { + return func(op *Op) { + op.dmesgCheck = b + } +} diff --git a/components/diagnose/scan.go b/components/diagnose/scan.go index 1484fbd1..1733ba0b 100644 --- a/components/diagnose/scan.go +++ b/components/diagnose/scan.go @@ -10,6 +10,7 @@ import ( "github.com/dustin/go-humanize" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" + nvidia_clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state" nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml" nvidia_query_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid" nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid" @@ -41,10 +42,6 @@ var defaultNVIDIALibraries = []string{ // Runs the scan operations. func Scan(ctx context.Context, opts ...OpOption) error { - if os.Geteuid() != 0 { - return errors.New("requires sudo/root access in order to scan dmesg errors") - } - op := &Op{} if err := op.applyOpts(opts); err != nil { return err @@ -126,6 +123,13 @@ func Scan(ctx context.Context, opts ...OpOption) error { } defer db.Close() + // "nvidia_query.Get" assumes that the "clock-events-state" table exists + // pre-create since this is a one-off operation + // TODO: move these into a single place + if err := nvidia_clock_events_state.CreateTable(ctx, db); err != nil { + log.Logger.Fatalw("failed to create clock events state table", "error", err) + } + outputRaw, err := nvidia_query.Get(ctx, db, db) if err != nil { log.Logger.Warnw("error getting nvidia info", "error", err) @@ -214,48 +218,54 @@ func Scan(ctx context.Context, opts ...OpOption) error { } println() - fmt.Printf("%s scanning dmesg for %d lines\n", inProgress, op.lines) - defaultDmesgCfg, err := dmesg.DefaultConfig(ctx) - if err != nil { - return err - } - matched, err := query_log_tail.Scan( - ctx, - query_log_tail.WithDedup(true), - query_log_tail.WithCommands(defaultDmesgCfg.Log.Scan.Commands), - query_log_tail.WithLinesToTail(op.lines), - query_log_tail.WithSelectFilter(defaultDmesgCfg.Log.SelectFilters...), - query_log_tail.WithExtractTime(defaultDmesgCfg.Log.TimeParseFunc), - query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) { - log.Logger.Debugw("matched", "line", string(line)) - fmt.Println("line", string(line)) - matchedB, _ := matched.YAML() - fmt.Println(string(matchedB)) - - if xid := nvidia_query_xid.ExtractNVRMXid(string(line)); xid > 0 { - if dm, err := nvidia_query_xid.ParseDmesgLogLine(metav1.Time{Time: time}, string(line)); err == nil { - log.Logger.Warnw("known xid", "line", string(line)) - yb, _ := dm.YAML() - fmt.Println(string(yb)) + if op.dmesgCheck { + if os.Geteuid() != 0 { + return errors.New("requires sudo/root access in order to scan dmesg errors") + } + + fmt.Printf("%s scanning dmesg for %d lines\n", inProgress, op.lines) + defaultDmesgCfg, err := dmesg.DefaultConfig(ctx) + if err != nil { + return err + } + matched, err := query_log_tail.Scan( + ctx, + query_log_tail.WithDedup(true), + query_log_tail.WithCommands(defaultDmesgCfg.Log.Scan.Commands), + query_log_tail.WithLinesToTail(op.lines), + query_log_tail.WithSelectFilter(defaultDmesgCfg.Log.SelectFilters...), + query_log_tail.WithExtractTime(defaultDmesgCfg.Log.TimeParseFunc), + query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) { + log.Logger.Debugw("matched", "line", string(line)) + fmt.Println("line", string(line)) + matchedB, _ := matched.YAML() + fmt.Println(string(matchedB)) + + if xid := nvidia_query_xid.ExtractNVRMXid(string(line)); xid > 0 { + if dm, err := nvidia_query_xid.ParseDmesgLogLine(metav1.Time{Time: time}, string(line)); err == nil { + log.Logger.Warnw("known xid", "line", string(line)) + yb, _ := dm.YAML() + fmt.Println(string(yb)) + } } - } - if sxid := nvidia_query_sxid.ExtractNVSwitchSXid(string(line)); sxid > 0 { - if dm, err := nvidia_query_sxid.ParseDmesgLogLine(metav1.Time{Time: time}, string(line)); err == nil { - log.Logger.Warnw("known sxid", "line", string(line)) - yb, _ := dm.YAML() - fmt.Println(string(yb)) + if sxid := nvidia_query_sxid.ExtractNVSwitchSXid(string(line)); sxid > 0 { + if dm, err := nvidia_query_sxid.ParseDmesgLogLine(metav1.Time{Time: time}, string(line)); err == nil { + log.Logger.Warnw("known sxid", "line", string(line)) + yb, _ := dm.YAML() + fmt.Println(string(yb)) + } } - } - }), - ) - if err != nil { - return err - } - if matched == 0 { - fmt.Printf("%s scanned dmesg file -- found no issue\n", checkMark) - } else { - fmt.Printf("%s scanned dmesg file -- found %d issue(s)\n", warningSign, matched) + }), + ) + if err != nil { + return err + } + if matched == 0 { + fmt.Printf("%s scanned dmesg file -- found no issue\n", checkMark) + } else { + fmt.Printf("%s scanned dmesg file -- found %d issue(s)\n", warningSign, matched) + } } if op.netcheck { diff --git a/components/diagnose/scan_test.go b/components/diagnose/scan_test.go new file mode 100644 index 00000000..d8b0fede --- /dev/null +++ b/components/diagnose/scan_test.go @@ -0,0 +1,16 @@ +package diagnose + +import ( + "context" + "testing" + "time" +) + +func TestScan(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) + defer cancel() + + if err := Scan(ctx); err != nil { + t.Fatalf("error scanning: %+v", err) + } +} diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 3aac54f8..37b2500b 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -28,6 +28,16 @@ func TestGpudHealthzInfo(t *testing.T) { t.Skip("skipping e2e tests") } + // start gpud scan + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + cmd := exec.CommandContext(ctx, os.Getenv("GPUD_BIN"), "scan", "--dmesg-check=false") + b, err := cmd.CombinedOutput() + cancel() + if err != nil { + t.Fatalf("failed to run gpud scan: %v\n%s", err, string(b)) + } + t.Logf("gpud scan output:\n%s", string(b)) + // get an available port listener, err := net.Listen("tcp", "localhost:0") if err != nil { @@ -42,10 +52,10 @@ func TestGpudHealthzInfo(t *testing.T) { randVal := randStr(t, 10) // start gpud command - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + ctx, cancel = context.WithTimeout(context.Background(), time.Minute) defer cancel() - cmd := exec.CommandContext(ctx, os.Getenv("GPUD_BIN"), "run", "--log-level=debug", "--web-enable=false", "--enable-auto-update=false", "--annotations", fmt.Sprintf("{%q:%q}", randKey, randVal), fmt.Sprintf("--listen-address=%s", ep)) + cmd = exec.CommandContext(ctx, os.Getenv("GPUD_BIN"), "run", "--log-level=debug", "--web-enable=false", "--enable-auto-update=false", "--annotations", fmt.Sprintf("{%q:%q}", randKey, randVal), fmt.Sprintf("--listen-address=%s", ep)) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr