Run cluster status monitor on unsharded controller only

Running one cluster status monitor per Fleet controller pod is unnecessary and may cause conflicts in sharded setups, so the monitor is now started only by the unsharded controller (the one with an empty shard ID).
rancher · Oct 22, 2024 · 60d35cf · 60d35cf
1 parent c4d94bd
commit 60d35cf
Showing 1 changed file with 11 additions and 9 deletions.
diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go
@@ -172,16 +172,18 @@ func start(
 		return err
 	}
 
-	setupLog.Info("starting cluster status monitor")
-	cfg := fleetcfg.Get()
-	// No need to run a similar check on the threshold, since its minimum value will be a multiple of the agent check-in
-	// interval anyway.
-	if cfg.ClusterMonitorInterval.Seconds() == 0 {
-		err := errors.New("cluster status monitor interval cannot be 0")
-		setupLog.Error(err, "cannot start cluster status monitor")
-		return err
+	if shardID == "" { // only one instance of the cluster status monitor needs to run.
+		setupLog.Info("starting cluster status monitor")
+		cfg := fleetcfg.Get()
+		// No need to run a similar check on the threshold, since its minimum value will be a multiple of the agent check-in
+		// interval anyway.
+		if cfg.ClusterMonitorInterval.Seconds() == 0 {
+			err := errors.New("cluster status monitor interval cannot be 0")
+			setupLog.Error(err, "cannot start cluster status monitor")
+			return err
+		}
+		go clustermonitor.Run(ctx, mgr.GetClient(), cfg.ClusterMonitorInterval.Duration, cfg.ClusterMonitorThreshold.Duration)
 	}
-	go clustermonitor.Run(ctx, mgr.GetClient(), cfg.ClusterMonitorInterval.Duration, cfg.ClusterMonitorThreshold.Duration)
 
 	setupLog.Info("starting job scheduler")
 	jobCtx, cancel := context.WithCancel(ctx)