diff --git a/charts/fleet/templates/configmap.yaml b/charts/fleet/templates/configmap.yaml index 8f0a5aff33..875c1505e3 100644 --- a/charts/fleet/templates/configmap.yaml +++ b/charts/fleet/templates/configmap.yaml @@ -16,6 +16,8 @@ data: "bundledeployment": "{{.Values.agent.reconciler.workers.bundledeployment}}", "drift": "{{.Values.agent.reconciler.workers.drift}}" }, + "clusterMonitorInterval": "{{.Values.clusterMonitorInterval}}", + "clusterMonitorThreshold": "{{.Values.clusterMonitorThreshold}}", {{ if .Values.garbageCollectionInterval }} "garbageCollectionInterval": "{{.Values.garbageCollectionInterval}}", {{ end }} diff --git a/charts/fleet/values.yaml b/charts/fleet/values.yaml index 7094877a47..4a1e1c2e0e 100644 --- a/charts/fleet/values.yaml +++ b/charts/fleet/values.yaml @@ -23,6 +23,15 @@ agentTLSMode: "system-store" # A duration string for how often agents should report a heartbeat agentCheckinInterval: "15m" +# Determines how long must have elapsed since a downstream cluster's Fleet agent last reported its status to the +# management cluster, before that downstream cluster is considered offline. +# If this configured value is shorter than three times the agent check-in interval, then that check-in +# interval-based value will be used instead to prevent false positives. +clusterMonitorThreshold: "45m" + +# Determines how often the cluster monitor will check for offline downstream clusters. +clusterMonitorInterval: "10m" + # The amount of time that agents will wait before they clean up old Helm releases. # A non-existent value or 0 will result in an interval of 15 minutes. 
garbageCollectionInterval: "15m" diff --git a/e2e/multi-cluster/installation/agent_test.go b/e2e/multi-cluster/installation/agent_test.go index 5bea51fdd2..d9aca41365 100644 --- a/e2e/multi-cluster/installation/agent_test.go +++ b/e2e/multi-cluster/installation/agent_test.go @@ -27,8 +27,9 @@ var _ = Describe("Fleet installation with TLS agent modes", func() { "cattle-fleet-system", "--type=merge", "-p", + // Agent check-in interval cannot be 0. Any other value will do here. fmt.Sprintf( - `{"data":{"config":"{\"apiServerURL\": \"https://google.com\", \"apiServerCA\": \"\", \"agentTLSMode\": \"%s\"}"}}`, + `{"data":{"config":"{\"apiServerURL\": \"https://google.com\", \"apiServerCA\": \"\", \"agentTLSMode\": \"%s\", \"agentCheckinInterval\": \"1m\"}"}}`, agentMode, ), ) diff --git a/internal/cmd/agent/deployer/monitor/condition.go b/internal/cmd/agent/deployer/monitor/condition.go index 87c3e69ef2..e085a89b7c 100644 --- a/internal/cmd/agent/deployer/monitor/condition.go +++ b/internal/cmd/agent/deployer/monitor/condition.go @@ -44,6 +44,14 @@ func (c Cond) IsFalse(obj interface{}) bool { return getStatus(obj, string(c)) == "False" } +func (c Cond) Unknown(obj interface{}) { + setStatus(obj, string(c), "Unknown") +} + +func (c Cond) IsUnknown(obj interface{}) bool { + return getStatus(obj, string(c)) == "Unknown" +} + func (c Cond) Reason(obj interface{}, reason string) { cond := findOrCreateCond(obj, string(c)) getFieldValue(cond, "Reason").SetString(reason) diff --git a/internal/cmd/controller/agentmanagement/controllers/cluster/import.go b/internal/cmd/controller/agentmanagement/controllers/cluster/import.go index 9541683ebe..c7c13a97ae 100644 --- a/internal/cmd/controller/agentmanagement/controllers/cluster/import.go +++ b/internal/cmd/controller/agentmanagement/controllers/cluster/import.go @@ -236,6 +236,10 @@ func (i *importHandler) importCluster(cluster *fleet.Cluster, status fleet.Clust apiServerCA = secret.Data[config.APIServerCAKey] ) + if 
cfg.AgentCheckinInterval.Seconds() == 0 { + return status, fmt.Errorf("agent check-in interval cannot be 0") + } + if apiServerURL == "" { if len(cfg.APIServerURL) == 0 { return status, fmt.Errorf("missing apiServerURL in fleet config for cluster auto registration") diff --git a/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent.go b/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent.go index f180761844..857ebb3ed0 100644 --- a/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent.go +++ b/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent.go @@ -256,6 +256,10 @@ func (h *handler) newAgentBundle(ns string, cluster *fleet.Cluster) (runtime.Obj agentNamespace = cluster.Spec.AgentNamespace } + if cfg.AgentCheckinInterval.Seconds() == 0 { + return nil, fmt.Errorf("agent check-in interval cannot be 0") + } + // Notice we only set the agentScope when it's a non-default agentNamespace. This is for backwards compatibility // for when we didn't have agent scope before objs := agent.Manifest( diff --git a/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent_test.go b/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent_test.go index 946f9ec47c..b3bee2d814 100644 --- a/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent_test.go +++ b/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent_test.go @@ -1,17 +1,37 @@ package manageagent import ( + "strings" "testing" + "time" "github.com/rancher/wrangler/v3/pkg/generic/fake" "go.uber.org/mock/gomock" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + "github.com/rancher/fleet/internal/config" fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" ) +func TestNewAgentBundle(t *testing.T) { + config.Set(&config.Config{AgentCheckinInterval: 
metav1.Duration{Duration: 0 * time.Second}}) + + h := handler{systemNamespace: "blah"} + obj, err := h.newAgentBundle("foo", &fleet.Cluster{Spec: fleet.ClusterSpec{AgentNamespace: "bar"}}) + + if obj != nil { + t.Fatalf("expected obj returned by newAgentBundle to be nil") + } + + expectedStr := "interval cannot be 0" + if !strings.Contains(err.Error(), expectedStr) { + t.Fatalf("expected error returned by newAgentBundle to contain %q", expectedStr) + } +} + func TestOnClusterChangeAffinity(t *testing.T) { ctrl := gomock.NewController(t) namespaces := fake.NewMockNonNamespacedControllerInterface[*corev1.Namespace, *corev1.NamespaceList](ctrl) diff --git a/internal/cmd/controller/clustermonitor/monitor.go b/internal/cmd/controller/clustermonitor/monitor.go new file mode 100644 index 0000000000..86b8b15cc2 --- /dev/null +++ b/internal/cmd/controller/clustermonitor/monitor.go @@ -0,0 +1,145 @@ +package clustermonitor + +import ( + "context" + "errors" + "math" + "time" + + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/rancher/fleet/internal/cmd/agent/deployer/monitor" + "github.com/rancher/fleet/internal/config" + "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" +) + +const offlineMsg = "cluster is offline" + +// Run monitors Fleet cluster resources' agent last seen dates. If a cluster's agent was last seen longer ago than +// a certain threshold, then Run updates statuses of all bundle deployments targeting that cluster, to reflect the fact +// that the cluster is offline. This prevents those bundle deployments from displaying outdated status information. +// +// A cluster will be considered offline if its Fleet agent has not reported its status for more than: +// - three times the agent check-in interval +// - or any larger configured interval. 
+// Therefore, this function requires configuration to have been loaded into the config package using `Load` before +// running. +// +// Bundle deployment status updates done here are unlikely to conflict with those done by the bundle deployment +// reconciler, which are either run from an online target cluster (from its Fleet agent) or triggered by other status +// updates such as this one (eg. bundle deployment reconciler living in the Fleet controller). +func Run(ctx context.Context, c client.Client, interval, threshold time.Duration) { + for { + select { + case <-ctx.Done(): + return + case <-time.After(interval): + } + + cfg := config.Get() // This enables config changes to take effect + + thresholdSecs := math.Max(cfg.AgentCheckinInterval.Seconds()*3, threshold.Seconds()) + + UpdateOfflineBundleDeployments(ctx, c, time.Second*time.Duration(thresholdSecs)) + } +} + +// UpdateOfflineBundleDeployments looks for offline clusters based on the provided threshold duration. For each cluster +// considered offline, this updates its bundle deployments' statuses accordingly. +// If a cluster's bundle deployments have already been marked as offline, they will be skipped. +func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, threshold time.Duration) { + logger := ctrl.Log.WithName("cluster status monitor") + + clusters := &v1alpha1.ClusterList{} + if err := c.List(ctx, clusters); err != nil { + logger.Error(err, "Failed to get list of clusters") + return + } + + for _, cluster := range clusters.Items { + lastSeen := cluster.Status.Agent.LastSeen + + logger.Info("Checking cluster status", "cluster", cluster.Name, "last seen", lastSeen.UTC().String()) + + // lastSeen being 0 would typically mean that the cluster is not registered yet, in which case bundle + // deployments should not be deployed there. 
+ if lastSeen.IsZero() || time.Now().UTC().Sub(lastSeen.UTC()) < threshold { + continue + } + + logger.Info("Detected offline cluster", "cluster", cluster.Name) + + // Cluster is offline + bundleDeployments := &v1alpha1.BundleDeploymentList{} + if err := c.List(ctx, bundleDeployments, client.InNamespace(cluster.Status.Namespace)); err != nil { + logger.Error( + err, + "Failed to get list of bundle deployments for offline cluster", + "cluster", + cluster.Name, + "namespace", + cluster.Status.Namespace, + ) + continue + } + + bd_update: + for _, bd := range bundleDeployments.Items { + for _, cond := range bd.Status.Conditions { + switch cond.Type { + case "Ready": + fallthrough + case "Monitored": + if cond.Message == offlineMsg { + continue bd_update + } + } + } + + logger.Info("Updating bundle deployment in offline cluster", "cluster", cluster.Name, "bundledeployment", bd.Name) + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + t := &v1alpha1.BundleDeployment{} + nsn := types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace} + if err := c.Get(ctx, nsn, t); err != nil { + return err + } + t.Status = bd.Status + // Any information about resources living in an offline cluster is likely to be + // outdated. + t.Status.ModifiedStatus = nil + t.Status.NonReadyStatus = nil + + for _, cond := range bd.Status.Conditions { + switch cond.Type { + case "Ready": + mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady) + mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg)) + mc.Unknown(&t.Status) + // XXX: do we want to set Deployed and Installed conditions as well? 
+ case "Monitored": + mc := monitor.Cond(v1alpha1.BundleDeploymentConditionMonitored) + mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg)) + + } + } + + return c.Status().Update(ctx, t) + }) + if err != nil { + logger.Error( + err, + "Failed to update bundle deployment status for offline cluster", + "bundledeployment", + bd.Name, + "cluster", + cluster.Name, + "namespace", + cluster.Status.Namespace, + ) + } + } + } +} diff --git a/internal/cmd/controller/clustermonitor/monitor_test.go b/internal/cmd/controller/clustermonitor/monitor_test.go new file mode 100644 index 0000000000..adb3bd412f --- /dev/null +++ b/internal/cmd/controller/clustermonitor/monitor_test.go @@ -0,0 +1,524 @@ +package clustermonitor_test + +import ( + "context" + "strings" + "testing" + "time" + + "go.uber.org/mock/gomock" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/rancher/fleet/internal/cmd/controller/clustermonitor" + "github.com/rancher/fleet/internal/mocks" + "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/wrangler/v3/pkg/genericcondition" +) + +const threshold = 1 * time.Second + +// BDStatusMatcher implements a gomock matcher for bundle deployment status. +// The matcher checks for empty modified and non-ready status, as well as offline state through Ready and Monitored +// conditions. 
+type BDStatusMatcher struct { +} + +func (m BDStatusMatcher) Matches(x interface{}) bool { + bd, ok := x.(*v1alpha1.BundleDeployment) + if !ok || bd == nil { + return false + } + + bdStatus := bd.Status + + if bdStatus.ModifiedStatus != nil || bdStatus.NonReadyStatus != nil { + return false + } + + foundReady, foundMonitored := false, false + for _, cond := range bdStatus.Conditions { + if cond.Type == "Ready" { + foundReady = true + + if cond.Status != "Unknown" || + !strings.Contains(cond.Reason, "offline") || + !strings.Contains(cond.Message, "offline") { + return false + } + } else if cond.Type == "Monitored" { + foundMonitored = true + + if cond.Status != "False" || + !strings.Contains(cond.Reason, "offline") || + !strings.Contains(cond.Message, "offline") { + return false + } + } + + } + + return foundReady && foundMonitored +} + +func (m BDStatusMatcher) String() string { + return "Bundle deployment status for offline cluster" +} + +type clusterWithOfflineMarker struct { + cluster v1alpha1.Cluster + isOffline bool + isAlreadyMarkedOffline bool +} + +func Test_Run(t *testing.T) { //nolint: funlen // this is a test function, its length should not be an issue + cases := []struct { + name string + clusters []clusterWithOfflineMarker + listClustersErr error + bundleDeployments map[string][]v1alpha1.BundleDeployment // indexed by cluster namespace + listBDErr error + }{ + { + name: "no cluster", + clusters: nil, + listClustersErr: nil, + bundleDeployments: nil, + listBDErr: nil, + }, + { + name: "no offline cluster", + clusters: []clusterWithOfflineMarker{ + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + // eg. 
not yet registered downstream cluster + Agent: v1alpha1.AgentStatus{ /* LastSeen is zero */ }, + }, + }, + }, + }, + listClustersErr: nil, + bundleDeployments: nil, + listBDErr: nil, + }, + { + name: "one offline cluster", + clusters: []clusterWithOfflineMarker{ + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-10 * threshold)}, + }, + Namespace: "clusterns1", + }, + }, + isOffline: true, + }, + }, + listClustersErr: nil, + bundleDeployments: map[string][]v1alpha1.BundleDeployment{ + "clusterns1": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns1", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + }, + listBDErr: nil, + }, + { + name: "multiple offline clusters", + clusters: []clusterWithOfflineMarker{ + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-10 * threshold)}, + }, + Namespace: "clusterns1", + }, + }, + isOffline: true, + }, + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster2", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + 
LastSeen: metav1.Time{Time: time.Now().UTC().Add(-5 * threshold)}, + }, + Namespace: "clusterns2", + }, + }, + isOffline: true, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster3", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-3 * threshold)}, + }, + Namespace: "clusterns3", + }, + }, + isOffline: true, + }, + }, + listClustersErr: nil, + bundleDeployments: map[string][]v1alpha1.BundleDeployment{ + "clusterns1": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns1", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + "clusterns2": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns2", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "my-other-bundledeployment", + Namespace: "clusterns2", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + "clusterns3": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns3", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + }, + listBDErr: nil, + }, + { + name: "multiple offline clusters, some of them already marked offline", + clusters: []clusterWithOfflineMarker{ + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + 
ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-10 * threshold)}, + }, + Namespace: "clusterns1", + }, + }, + isOffline: true, + }, + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster2", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-5 * threshold)}, + }, + Namespace: "clusterns2", + }, + }, + isOffline: true, + isAlreadyMarkedOffline: true, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster3", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-3 * threshold)}, + }, + Namespace: "clusterns3", + }, + }, + isOffline: true, + }, + }, + listClustersErr: nil, + bundleDeployments: map[string][]v1alpha1.BundleDeployment{ + "clusterns1": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns1", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + "clusterns2": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns2", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + Message: "cluster is offline", + }, + { + Type: "Monitored", + Message: "cluster is offline", + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "my-other-bundledeployment", + Namespace: "clusterns2", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + Message: 
"cluster is offline", + }, + { + Type: "Monitored", + Message: "cluster is offline", + }, + }, + }, + }, + }, + "clusterns3": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns3", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + }, + listBDErr: nil, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + testClient := mocks.NewMockClient(ctrl) + ctx, cancel := context.WithCancel(context.Background()) + + defer cancel() + + testClient.EXPECT().List(ctx, gomock.Any(), gomock.Any()).DoAndReturn( + func(ctx context.Context, list *v1alpha1.ClusterList, opts ...client.ListOption) error { + for _, cl := range tc.clusters { + list.Items = append(list.Items, cl.cluster) + } + + return tc.listClustersErr + }, + ) + + for _, cl := range tc.clusters { + if !cl.isOffline { + continue + } + + bundleDeplsForCluster := tc.bundleDeployments[cl.cluster.Status.Namespace] + testClient.EXPECT().List(ctx, gomock.Any(), gomock.Any()).DoAndReturn( + func(ctx context.Context, list *v1alpha1.BundleDeploymentList, opts ...client.ListOption) error { + list.Items = append(list.Items, bundleDeplsForCluster...) + + return tc.listBDErr + }, + ) + + if cl.isAlreadyMarkedOffline { + // skip status updates for bundle deployments which have already been + // marked offline by a previous monitoring loop. + continue + } + + for _, bd := range bundleDeplsForCluster { + testClient.EXPECT().Get(ctx, gomock.Any(), gomock.Any()).DoAndReturn( + func( + ctx context.Context, + nsn types.NamespacedName, + // b's initial value is never used, but the variable needs to be + // named for its value to be overwritten with values we care + // about, to simulate a response from the API server. 
+ b *v1alpha1.BundleDeployment, + opts ...client.GetOption, + ) error { + *b = bd + + return nil + }, + ) + + srw := mocks.NewMockSubResourceWriter(ctrl) + testClient.EXPECT().Status().Return(srw) + + srw.EXPECT().Update(ctx, &BDStatusMatcher{}).Return(nil) + } + } + + clustermonitor.UpdateOfflineBundleDeployments(ctx, testClient, threshold) + }) + } +} diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go index 73881c62a9..084db3ee98 100644 --- a/internal/cmd/controller/operator.go +++ b/internal/cmd/controller/operator.go @@ -2,13 +2,16 @@ package controller import ( "context" + "errors" "fmt" "github.com/reugn/go-quartz/quartz" "github.com/rancher/fleet/internal/cmd" + "github.com/rancher/fleet/internal/cmd/controller/clustermonitor" "github.com/rancher/fleet/internal/cmd/controller/reconciler" "github.com/rancher/fleet/internal/cmd/controller/target" + fleetcfg "github.com/rancher/fleet/internal/config" "github.com/rancher/fleet/internal/manifest" "github.com/rancher/fleet/internal/metrics" "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" @@ -175,6 +178,19 @@ func start( return err } + if shardID == "" { // only one instance of the cluster status monitor needs to run. + setupLog.Info("starting cluster status monitor") + cfg := fleetcfg.Get() + // No need to run a similar check on the threshold, since its minimum value will be a multiple of the agent check-in + // interval anyway. 
+ if cfg.ClusterMonitorInterval.Seconds() == 0 { + err := errors.New("cluster status monitor interval cannot be 0") + setupLog.Error(err, "cannot start cluster status monitor") + return err + } + go clustermonitor.Run(ctx, mgr.GetClient(), cfg.ClusterMonitorInterval.Duration, cfg.ClusterMonitorThreshold.Duration) + } + setupLog.Info("starting job scheduler") jobCtx, cancel := context.WithCancel(ctx) defer cancel() diff --git a/internal/config/config.go b/internal/config/config.go index 6187a22e41..e7cd672aa8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -121,6 +121,15 @@ type Config struct { // AgentWorkers specifies the maximum number of workers for each agent reconciler. AgentWorkers AgentWorkers `json:"agentWorkers,omitempty"` + + // ClusterMonitorInterval determines how often the cluster monitor will check for offline downstream clusters. + ClusterMonitorInterval metav1.Duration `json:"clusterMonitorInterval,omitempty"` + + // ClusterMonitorThreshold determines how long must have elapsed since a downstream cluster's Fleet agent last + // reported its status to the management cluster, before that downstream cluster is considered offline. + // If this configured value is shorter than three times the agent check-in interval, then that check-in + // interval-based value will be used instead to prevent false positives. + ClusterMonitorThreshold metav1.Duration `json:"clusterMonitorThreshold,omitempty"` } type AgentWorkers struct {