Merge #117296

117296: roachtest: don't connect workload to stalling node r=nicktrav a=itsbilal

Previously, we'd connect the workload binary to every node, including the node we'd induce a stall on. This led to an issue where 1/3 of our workload workers would just not be able to connect, resulting in at least a one-third drop in QPS and making the test as a whole flaky.

This change updates the workload call to only connect to nodes 2 and 3, which are expected to stay up throughout the test. This is in line with user expectation: a stalling node is expected to crash, while connections to other nodes are expected to keep working.

Fixes #116631.

Epic: none

Release note: None

Co-authored-by: Bilal Akhtar <[email protected]>
cockroachdb · Jan 4, 2024 · f2917d8 · f2917d8
2 parents e4067f1 + 52a0d1d
commit f2917d8
Showing 1 changed file with 4 additions and 1 deletion.
diff --git a/pkg/cmd/roachtest/tests/disk_stall.go b/pkg/cmd/roachtest/tests/disk_stall.go
@@ -129,10 +129,13 @@ func runDiskStalledDetection(
 	workloadStartAt := timeutil.Now()
 	m := c.NewMonitor(ctx, c.Range(1, 3))
 	m.Go(func(ctx context.Context) error {
+		// NB: Since we stall node 1, we run the workload only on nodes 2-3 so
+		// the post-stall QPS isn't affected by the fact that 1/3rd of workload
+		// workers just can't connect to a working node.
 		c.Run(ctx, c.Node(4), `./cockroach workload run kv --read-percent 50 `+
 			`--duration 10m --concurrency 256 --max-rate 2048 --tolerate-errors `+
 			` --min-block-bytes=512 --max-block-bytes=512 `+
-			`{pgurl:1-3}`)
+			`{pgurl:2-3}`)
 		return nil
 	})
 	defer m.Wait()