From 67cef7b59cc2ea18433bc033f80353a104a18fbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Thu, 3 Oct 2024 16:17:46 +0200 Subject: [PATCH 01/14] Add cluster status monitor to controller This allows the Fleet controller to detect offline clusters and update statuses of bundle deployments targeting offline clusters. Next to do: * understand how bundle deployment status updates should be propagated to bundles (which currently simply appear as `Modified`) and further up * write tests (eg. integration tests, updating cluster status by hand?) * set sensible defaults (eg. monitoring interval higher, and threshold higher than agent's cluster status reporting interval) --- internal/cmd/controller/operator.go | 108 ++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go index 73881c62a9..d5363ef7ab 100644 --- a/internal/cmd/controller/operator.go +++ b/internal/cmd/controller/operator.go @@ -3,6 +3,7 @@ package controller import ( "context" "fmt" + "time" "github.com/reugn/go-quartz/quartz" @@ -14,12 +15,17 @@ import ( "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + "github.com/rancher/fleet/internal/cmd/agent/deployer/monitor" ) var ( @@ -175,6 +181,9 @@ func start( return err } + setupLog.Info("starting cluster status monitor") + go runClusterStatusMonitor(ctx, mgr.GetClient()) + setupLog.Info("starting job scheduler") jobCtx, cancel := context.WithCancel(ctx) defer cancel() @@ -191,3 +200,102 @@ func start( return nil } + 
+func runClusterStatusMonitor(ctx context.Context, c client.Client) { + threshold := 15 * time.Second // TODO load or hard-code sensible value + + logger := ctrl.Log.WithName("cluster status monitor") + + for { + select { + case <-ctx.Done(): + return + case <-time.After(threshold): + } + + clusters := &v1alpha1.ClusterList{} + if err := c.List(ctx, clusters); err != nil { + logger.Error(err, "Failed to get list of clusters") + continue + } + + for _, cluster := range clusters.Items { + lastSeen := cluster.Status.Agent.LastSeen + + // FIXME threshold should not be lower than cluster status refresh default value (15 min) + + // XXX: should the same value be used for both the polling interval and the threshold? + logger.Info("Checking cluster status", "cluster", cluster.Name, "last seen", lastSeen.UTC().String()) + + // XXX: do we want to run this more than once per cluster, updating the timestamp each time? + // Or would it make sense to keep the oldest possible timestamp in place, for users to know since when the + // cluster is offline? + + // lastSeen being 0 would typically mean that the cluster is not registered yet, in which case bundle + // deployments should not be deployed there. + if lastSeen.IsZero() || time.Now().UTC().Sub(lastSeen.UTC()) < threshold { + continue + } + + logger.Info("Detected offline cluster", "cluster", cluster.Name) + + // Cluster is offline + bundleDeployments := &v1alpha1.BundleDeploymentList{} + if err := c.List(ctx, bundleDeployments, client.InNamespace(cluster.Status.Namespace)); err != nil { + logger.Error( + err, + "Failed to get list of bundle deployments for offline cluster", + "cluster", + cluster.Name, + "namespace", + cluster.Status.Namespace, + ) + continue + } + + // These updates should not conflict with those done by the bundle deployment reconciler (offline vs online + // clusters). 
+ for _, bd := range bundleDeployments.Items { + logger.Info("Updating bundle deployment in offline cluster", "cluster", cluster.Name, "bundledeployment", bd.Name) + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + t := &v1alpha1.BundleDeployment{} + nsn := types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace} + logger.Info("[DEBUG] getting bundle deployment", "cluster", cluster.Name, "bundledeployment", bd.Name) + if err := c.Get(ctx, nsn, t); err != nil { + return err + } + t.Status = bd.Status + // TODO status updates: update condition with type Ready + t.Status.ModifiedStatus = nil + + for _, cond := range bd.Status.Conditions { + if cond.Type != "Ready" { + continue + } + + // FIXME: avoid relying on agent pkg for this? + mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady) + mc.SetError(&bd.Status, "Cluster offline", fmt.Errorf("cluster is offline")) + //cond.LastUpdated(status, time.Now().UTC().Format(time.RFC3339)) + } + + logger.Info("[DEBUG] updating bundle deployment status", "cluster", cluster.Name, "bundledeployment", bd.Name) + + return c.Status().Update(ctx, t) + }) + if err != nil { + logger.Error( + err, + "Failed to update bundle deployment status for offline cluster", + "bundledeployment", + bd.Name, + "cluster", + cluster.Name, + "namespace", + cluster.Status.Namespace, + ) + } + } + } + } +} From 2aed58306ca74850fcbd0aba8f92b5f82fc6971c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Fri, 4 Oct 2024 11:26:29 +0200 Subject: [PATCH 02/14] Reflect offline cluster state in more bundle deployment status fields This enables that state to be reflected upwards in bundle, GitRepo, cluster and cluster group statuses. 
--- internal/cmd/controller/operator.go | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go index d5363ef7ab..c64e078fed 100644 --- a/internal/cmd/controller/operator.go +++ b/internal/cmd/controller/operator.go @@ -260,27 +260,30 @@ func runClusterStatusMonitor(ctx context.Context, c client.Client) { err := retry.RetryOnConflict(retry.DefaultRetry, func() error { t := &v1alpha1.BundleDeployment{} nsn := types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace} - logger.Info("[DEBUG] getting bundle deployment", "cluster", cluster.Name, "bundledeployment", bd.Name) if err := c.Get(ctx, nsn, t); err != nil { return err } t.Status = bd.Status - // TODO status updates: update condition with type Ready + // Any information about resources living in an offline cluster is likely to be + // outdated. t.Status.ModifiedStatus = nil + t.Status.NonReadyStatus = nil for _, cond := range bd.Status.Conditions { - if cond.Type != "Ready" { - continue - } + switch cond.Type { + // XXX: which messages do we want to set and where? + case "Ready": + // FIXME: avoid relying on agent pkg for this? + mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady) + mc.SetError(&bd.Status, "Cluster offline", fmt.Errorf("cluster is offline")) + // XXX: do we want to set Deployed and Installed conditions as well? + case "Monitored": + mc := monitor.Cond(v1alpha1.BundleDeploymentConditionMonitored) + mc.SetError(&bd.Status, "Cluster offline", fmt.Errorf("cluster is offline")) - // FIXME: avoid relying on agent pkg for this? 
- mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady) - mc.SetError(&bd.Status, "Cluster offline", fmt.Errorf("cluster is offline")) - //cond.LastUpdated(status, time.Now().UTC().Format(time.RFC3339)) + } } - logger.Info("[DEBUG] updating bundle deployment status", "cluster", cluster.Name, "bundledeployment", bd.Name) - return c.Status().Update(ctx, t) }) if err != nil { From 4f3a974b7828ddabd6995b2f0723c8883c332282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Mon, 7 Oct 2024 10:21:28 +0200 Subject: [PATCH 03/14] Move cluster status monitor to separate package This also provides unit tests for detecting offline clusters. --- .../cmd/controller/clustermonitor/monitor.go | 125 ++++++ .../controller/clustermonitor/monitor_test.go | 365 ++++++++++++++++++ internal/cmd/controller/operator.go | 110 +----- 3 files changed, 492 insertions(+), 108 deletions(-) create mode 100644 internal/cmd/controller/clustermonitor/monitor.go create mode 100644 internal/cmd/controller/clustermonitor/monitor_test.go diff --git a/internal/cmd/controller/clustermonitor/monitor.go b/internal/cmd/controller/clustermonitor/monitor.go new file mode 100644 index 0000000000..2acea03c7d --- /dev/null +++ b/internal/cmd/controller/clustermonitor/monitor.go @@ -0,0 +1,125 @@ +package clustermonitor + +import ( + "context" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/rancher/fleet/internal/cmd/agent/deployer/monitor" + "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" +) + +// Run monitors Fleet cluster resources' agent last seen dates. If a cluster's agent was last seen longer ago than +// threshold, then Run updates statuses of all bundle deployments targeting that cluster, to reflect the fact that the +// cluster is offline. This prevents those bundle deployments from displaying outdated status information. 
+// +// Bundle deployment status updates done here are unlikely to conflict with those done by the bundle deployment +// reconciler, which are either run from an online target cluster (from its Fleet agent) or triggered by other status +// updates such as this one (eg. bundle deployment reconciler living in the Fleet controller). +func Run(ctx context.Context, c client.Client, threshold time.Duration) { + for { + select { + case <-ctx.Done(): + return + case <-time.After(threshold): + } + + // XXX: should the same value be used for both the polling interval and the threshold? + UpdateOfflineBundleDeployments(ctx, c, threshold) + } +} + +func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, threshold time.Duration) { + logger := ctrl.Log.WithName("cluster status monitor") + + clusters := &v1alpha1.ClusterList{} + if err := c.List(ctx, clusters); err != nil { + logger.Error(err, "Failed to get list of clusters") + return + } + + for _, cluster := range clusters.Items { + lastSeen := cluster.Status.Agent.LastSeen + + // FIXME threshold should not be lower than cluster status refresh default value (15 min) + + logger.Info("Checking cluster status", "cluster", cluster.Name, "last seen", lastSeen.UTC().String()) + + // XXX: do we want to run this more than once per cluster, updating the timestamp each time? + // Or would it make sense to keep the oldest possible timestamp in place, for users to know since when the + // cluster is offline? + + // lastSeen being 0 would typically mean that the cluster is not registered yet, in which case bundle + // deployments should not be deployed there. 
+ if lastSeen.IsZero() || time.Now().UTC().Sub(lastSeen.UTC()) < threshold { + continue + } + + logger.Info("Detected offline cluster", "cluster", cluster.Name) + + // Cluster is offline + bundleDeployments := &v1alpha1.BundleDeploymentList{} + if err := c.List(ctx, bundleDeployments, client.InNamespace(cluster.Status.Namespace)); err != nil { + logger.Error( + err, + "Failed to get list of bundle deployments for offline cluster", + "cluster", + cluster.Name, + "namespace", + cluster.Status.Namespace, + ) + continue + } + + for _, bd := range bundleDeployments.Items { + logger.Info("Updating bundle deployment in offline cluster", "cluster", cluster.Name, "bundledeployment", bd.Name) + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + t := &v1alpha1.BundleDeployment{} + nsn := types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace} + if err := c.Get(ctx, nsn, t); err != nil { + return err + } + t.Status = bd.Status + // Any information about resources living in an offline cluster is likely to be + // outdated. + t.Status.ModifiedStatus = nil + t.Status.NonReadyStatus = nil + + for _, cond := range bd.Status.Conditions { + switch cond.Type { + // XXX: which messages do we want to set and where? + case "Ready": + // FIXME: avoid relying on agent pkg for this? + mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady) + mc.SetError(&t.Status, "Cluster offline", fmt.Errorf("cluster is offline")) + // XXX: do we want to set Deployed and Installed conditions as well? + // XXX: should we set conditions to `Unknown`? 
+ case "Monitored": + mc := monitor.Cond(v1alpha1.BundleDeploymentConditionMonitored) + mc.SetError(&t.Status, "Cluster offline", fmt.Errorf("cluster is offline")) + + } + } + + return c.Status().Update(ctx, t) + }) + if err != nil { + logger.Error( + err, + "Failed to update bundle deployment status for offline cluster", + "bundledeployment", + bd.Name, + "cluster", + cluster.Name, + "namespace", + cluster.Status.Namespace, + ) + } + } + } +} diff --git a/internal/cmd/controller/clustermonitor/monitor_test.go b/internal/cmd/controller/clustermonitor/monitor_test.go new file mode 100644 index 0000000000..c72566071b --- /dev/null +++ b/internal/cmd/controller/clustermonitor/monitor_test.go @@ -0,0 +1,365 @@ +package clustermonitor_test + +import ( + "context" + "strings" + "testing" + "time" + + "go.uber.org/mock/gomock" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/rancher/fleet/internal/cmd/controller/clustermonitor" + "github.com/rancher/fleet/internal/mocks" + "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/wrangler/v3/pkg/genericcondition" +) + +const threshold = 1 * time.Second + +// BDStatusMatcher implements a gomock matcher for bundle deployment status. +// The matcher checks for empty modified and non-ready status, as well as offline state through Ready and Monitored +// conditions. 
+type BDStatusMatcher struct { +} + +func (m BDStatusMatcher) Matches(x interface{}) bool { + bd, ok := x.(*v1alpha1.BundleDeployment) + if !ok { + return false + } + + bdStatus := bd.Status + + if bd == nil { + return false + } + + if bdStatus.ModifiedStatus != nil || bdStatus.NonReadyStatus != nil { + return false + } + + foundReady, foundMonitored := false, false + for _, cond := range bdStatus.Conditions { + if cond.Type == "Ready" { + foundReady = true + + if !strings.Contains(cond.Message, "offline") { + return false + } + } else if cond.Type == "Monitored" { + foundMonitored = true + + if !strings.Contains(cond.Message, "offline") { + return false + } + } + + } + + return foundReady && foundMonitored +} + +func (m BDStatusMatcher) String() string { + return "Bundle deployment status for offline cluster" +} + +type clusterWithOfflineMarker struct { + cluster v1alpha1.Cluster + isOffline bool +} + +func Test_Run(t *testing.T) { + cases := []struct { + name string + clusters []clusterWithOfflineMarker + listClustersErr error + bundleDeployments map[string][]v1alpha1.BundleDeployment // indexed by cluster namespace + listBDErr error + }{ + { + name: "no cluster", + clusters: nil, + listClustersErr: nil, + bundleDeployments: nil, + listBDErr: nil, + }, + { + name: "no offline cluster", + clusters: []clusterWithOfflineMarker{ + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + // eg. 
not yet registered downstream cluster + Agent: v1alpha1.AgentStatus{ /* LastSeen is zero */ }, + }, + }, + }, + }, + listClustersErr: nil, + bundleDeployments: nil, + listBDErr: nil, + }, + { + name: "one offline cluster", + clusters: []clusterWithOfflineMarker{ + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-10 * threshold)}, + }, + Namespace: "clusterns1", + }, + }, + isOffline: true, + }, + }, + listClustersErr: nil, + bundleDeployments: map[string][]v1alpha1.BundleDeployment{ + "clusterns1": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns1", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + }, + listBDErr: nil, + }, + { + name: "multiple offline clusters", + clusters: []clusterWithOfflineMarker{ + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-10 * threshold)}, + }, + Namespace: "clusterns1", + }, + }, + isOffline: true, + }, + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster2", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + 
LastSeen: metav1.Time{Time: time.Now().UTC().Add(-5 * threshold)}, + }, + Namespace: "clusterns2", + }, + }, + isOffline: true, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster3", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-3 * threshold)}, + }, + Namespace: "clusterns3", + }, + }, + isOffline: true, + }, + }, + listClustersErr: nil, + bundleDeployments: map[string][]v1alpha1.BundleDeployment{ + "clusterns1": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns1", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + "clusterns2": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns2", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "my-other-bundledeployment", + Namespace: "clusterns2", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + "clusterns3": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns3", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + }, + listBDErr: nil, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + testClient := mocks.NewMockClient(ctrl) + ctx, cancel := context.WithCancel(context.Background()) + + defer cancel() + + testClient.EXPECT().List(ctx, gomock.Any(), gomock.Any()).DoAndReturn( + func(ctx 
context.Context, list *v1alpha1.ClusterList, opts ...client.ListOption) error { + for _, cl := range tc.clusters { + list.Items = append(list.Items, cl.cluster) + } + + return tc.listClustersErr + }, + ) + + for _, cl := range tc.clusters { + if !cl.isOffline { + continue + } + + bundleDeplsForCluster := tc.bundleDeployments[cl.cluster.Status.Namespace] + testClient.EXPECT().List(ctx, gomock.Any(), gomock.Any()).DoAndReturn( + func(ctx context.Context, list *v1alpha1.BundleDeploymentList, opts ...client.ListOption) error { + list.Items = append(list.Items, bundleDeplsForCluster...) + + return tc.listBDErr + }, + ) + + for _, bd := range bundleDeplsForCluster { + testClient.EXPECT().Get(ctx, gomock.Any(), gomock.Any()).DoAndReturn( + func(ctx context.Context, nsn types.NamespacedName, b *v1alpha1.BundleDeployment, opts ...client.GetOption) error { + b = &bd + + return nil + }, + ) + + srw := mocks.NewMockSubResourceWriter(ctrl) + testClient.EXPECT().Status().Return(srw) + + srw.EXPECT().Update(ctx, &BDStatusMatcher{}).Return(nil) + } + } + + clustermonitor.UpdateOfflineBundleDeployments(ctx, testClient, threshold) + }) + } +} diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go index c64e078fed..a06c1528c0 100644 --- a/internal/cmd/controller/operator.go +++ b/internal/cmd/controller/operator.go @@ -8,6 +8,7 @@ import ( "github.com/reugn/go-quartz/quartz" "github.com/rancher/fleet/internal/cmd" + "github.com/rancher/fleet/internal/cmd/controller/clustermonitor" "github.com/rancher/fleet/internal/cmd/controller/reconciler" "github.com/rancher/fleet/internal/cmd/controller/target" "github.com/rancher/fleet/internal/manifest" @@ -15,17 +16,12 @@ import ( "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" - 
"k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" - - "github.com/rancher/fleet/internal/cmd/agent/deployer/monitor" ) var ( @@ -182,7 +178,7 @@ func start( } setupLog.Info("starting cluster status monitor") - go runClusterStatusMonitor(ctx, mgr.GetClient()) + go clustermonitor.Run(ctx, mgr.GetClient(), 15*time.Second) // TODO load or hard-code sensible value setupLog.Info("starting job scheduler") jobCtx, cancel := context.WithCancel(ctx) @@ -200,105 +196,3 @@ func start( return nil } - -func runClusterStatusMonitor(ctx context.Context, c client.Client) { - threshold := 15 * time.Second // TODO load or hard-code sensible value - - logger := ctrl.Log.WithName("cluster status monitor") - - for { - select { - case <-ctx.Done(): - return - case <-time.After(threshold): - } - - clusters := &v1alpha1.ClusterList{} - if err := c.List(ctx, clusters); err != nil { - logger.Error(err, "Failed to get list of clusters") - continue - } - - for _, cluster := range clusters.Items { - lastSeen := cluster.Status.Agent.LastSeen - - // FIXME threshold should not be lower than cluster status refresh default value (15 min) - - // XXX: should the same value be used for both the polling interval and the threshold? - logger.Info("Checking cluster status", "cluster", cluster.Name, "last seen", lastSeen.UTC().String()) - - // XXX: do we want to run this more than once per cluster, updating the timestamp each time? - // Or would it make sense to keep the oldest possible timestamp in place, for users to know since when the - // cluster is offline? - - // lastSeen being 0 would typically mean that the cluster is not registered yet, in which case bundle - // deployments should not be deployed there. 
- if lastSeen.IsZero() || time.Now().UTC().Sub(lastSeen.UTC()) < threshold { - continue - } - - logger.Info("Detected offline cluster", "cluster", cluster.Name) - - // Cluster is offline - bundleDeployments := &v1alpha1.BundleDeploymentList{} - if err := c.List(ctx, bundleDeployments, client.InNamespace(cluster.Status.Namespace)); err != nil { - logger.Error( - err, - "Failed to get list of bundle deployments for offline cluster", - "cluster", - cluster.Name, - "namespace", - cluster.Status.Namespace, - ) - continue - } - - // These updates should not conflict with those done by the bundle deployment reconciler (offline vs online - // clusters). - for _, bd := range bundleDeployments.Items { - logger.Info("Updating bundle deployment in offline cluster", "cluster", cluster.Name, "bundledeployment", bd.Name) - err := retry.RetryOnConflict(retry.DefaultRetry, func() error { - t := &v1alpha1.BundleDeployment{} - nsn := types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace} - if err := c.Get(ctx, nsn, t); err != nil { - return err - } - t.Status = bd.Status - // Any information about resources living in an offline cluster is likely to be - // outdated. - t.Status.ModifiedStatus = nil - t.Status.NonReadyStatus = nil - - for _, cond := range bd.Status.Conditions { - switch cond.Type { - // XXX: which messages do we want to set and where? - case "Ready": - // FIXME: avoid relying on agent pkg for this? - mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady) - mc.SetError(&bd.Status, "Cluster offline", fmt.Errorf("cluster is offline")) - // XXX: do we want to set Deployed and Installed conditions as well? 
- case "Monitored": - mc := monitor.Cond(v1alpha1.BundleDeploymentConditionMonitored) - mc.SetError(&bd.Status, "Cluster offline", fmt.Errorf("cluster is offline")) - - } - } - - return c.Status().Update(ctx, t) - }) - if err != nil { - logger.Error( - err, - "Failed to update bundle deployment status for offline cluster", - "bundledeployment", - bd.Name, - "cluster", - cluster.Name, - "namespace", - cluster.Status.Namespace, - ) - } - } - } - } -} From 7aea167e056fae0b30e62ca85ea2131ea94d12d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Mon, 7 Oct 2024 11:54:01 +0200 Subject: [PATCH 04/14] Set sensible configuration for cluster status monitor The frequency of cluster status checks is currently hard-coded to 1 minute, but could be made configurable. The threshold for considering a cluster offline now explicitly depends on how often agents report their statuses to the management cluster. Changes to that configured interval should impact the cluster status monitor, which would take the new value into account from its next run onwards. --- .../cmd/controller/clustermonitor/monitor.go | 21 ++++++++++++------- internal/cmd/controller/operator.go | 3 +-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/internal/cmd/controller/clustermonitor/monitor.go b/internal/cmd/controller/clustermonitor/monitor.go index 2acea03c7d..628c61c416 100644 --- a/internal/cmd/controller/clustermonitor/monitor.go +++ b/internal/cmd/controller/clustermonitor/monitor.go @@ -11,25 +11,34 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/rancher/fleet/internal/cmd/agent/deployer/monitor" + "github.com/rancher/fleet/internal/config" "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" ) // Run monitors Fleet cluster resources' agent last seen dates. 
If a cluster's agent was last seen longer ago than -// threshold, then Run updates statuses of all bundle deployments targeting that cluster, to reflect the fact that the -// cluster is offline. This prevents those bundle deployments from displaying outdated status information. +// a certain threshold, then Run updates statuses of all bundle deployments targeting that cluster, to reflect the fact +// that the cluster is offline. This prevents those bundle deployments from displaying outdated status information. +// +// The threshold is computed based on the configured agent check-in interval, plus a 10 percent margin. +// Therefore, this function requires configuration to have been loaded into the config package using `Load` before +// running. // // Bundle deployment status updates done here are unlikely to conflict with those done by the bundle deployment // reconciler, which are either run from an online target cluster (from its Fleet agent) or triggered by other status // updates such as this one (eg. bundle deployment reconciler living in the Fleet controller). -func Run(ctx context.Context, c client.Client, threshold time.Duration) { +func Run(ctx context.Context, c client.Client) { for { select { case <-ctx.Done(): return - case <-time.After(threshold): + case <-time.After(time.Minute): // XXX: should this be configurable? } - // XXX: should the same value be used for both the polling interval and the threshold? + cfg := config.Get() // This enables config changes to take effect + + // Add a 10% margin, which is arbitrary but should reduce the risk of false positives. 
+ threshold := time.Duration(cfg.AgentCheckinInterval.Seconds() * 1.1) + UpdateOfflineBundleDeployments(ctx, c, threshold) } } @@ -46,8 +55,6 @@ func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, thresh for _, cluster := range clusters.Items { lastSeen := cluster.Status.Agent.LastSeen - // FIXME threshold should not be lower than cluster status refresh default value (15 min) - logger.Info("Checking cluster status", "cluster", cluster.Name, "last seen", lastSeen.UTC().String()) // XXX: do we want to run this more than once per cluster, updating the timestamp each time? diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go index a06c1528c0..66ba3b234e 100644 --- a/internal/cmd/controller/operator.go +++ b/internal/cmd/controller/operator.go @@ -3,7 +3,6 @@ package controller import ( "context" "fmt" - "time" "github.com/reugn/go-quartz/quartz" @@ -178,7 +177,7 @@ func start( } setupLog.Info("starting cluster status monitor") - go clustermonitor.Run(ctx, mgr.GetClient(), 15*time.Second) // TODO load or hard-code sensible value + go clustermonitor.Run(ctx, mgr.GetClient()) setupLog.Info("starting job scheduler") jobCtx, cancel := context.WithCancel(ctx) From 53e29ebcea5bac0e5f7522620847171c572a43ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Mon, 7 Oct 2024 13:04:03 +0200 Subject: [PATCH 05/14] Eliminate linting errors This fixes an error and ignores a few others to make the linter happy. 
--- .../controller/clustermonitor/monitor_test.go | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/internal/cmd/controller/clustermonitor/monitor_test.go b/internal/cmd/controller/clustermonitor/monitor_test.go index c72566071b..db85c32e5f 100644 --- a/internal/cmd/controller/clustermonitor/monitor_test.go +++ b/internal/cmd/controller/clustermonitor/monitor_test.go @@ -27,16 +27,12 @@ type BDStatusMatcher struct { func (m BDStatusMatcher) Matches(x interface{}) bool { bd, ok := x.(*v1alpha1.BundleDeployment) - if !ok { + if !ok || bd == nil { return false } bdStatus := bd.Status - if bd == nil { - return false - } - if bdStatus.ModifiedStatus != nil || bdStatus.NonReadyStatus != nil { return false } @@ -71,7 +67,7 @@ type clusterWithOfflineMarker struct { isOffline bool } -func Test_Run(t *testing.T) { +func Test_Run(t *testing.T) { //nolint: funlen // this is a test function, its length should not be an issue cases := []struct { name string clusters []clusterWithOfflineMarker @@ -345,8 +341,16 @@ func Test_Run(t *testing.T) { for _, bd := range bundleDeplsForCluster { testClient.EXPECT().Get(ctx, gomock.Any(), gomock.Any()).DoAndReturn( - func(ctx context.Context, nsn types.NamespacedName, b *v1alpha1.BundleDeployment, opts ...client.GetOption) error { - b = &bd + func( + ctx context.Context, + nsn types.NamespacedName, + // b's initial value is never used, but the variable needs to be + // named for its value to be overwritten with values we care + // about, to simulate a response from the API server. + b *v1alpha1.BundleDeployment, //nolint: staticcheck + opts ...client.GetOption, + ) error { + b = &bd //nolint: ineffassign,staticcheck // the value is used by the implementation, not directly by the tests. 
return nil }, From b608c5200ac094825734943a21c02a1464f7b794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Tue, 8 Oct 2024 11:44:25 +0200 Subject: [PATCH 06/14] Check for condition status and reason This adds checks ensuring that for offline clusters, for which calls to update bundle deployment statuses are expected, those statuses contain `Ready` and `Monitored` conditions with status `False` and reasons reflecting the cluster's offline status. --- internal/cmd/controller/clustermonitor/monitor_test.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internal/cmd/controller/clustermonitor/monitor_test.go b/internal/cmd/controller/clustermonitor/monitor_test.go index db85c32e5f..36b54937d7 100644 --- a/internal/cmd/controller/clustermonitor/monitor_test.go +++ b/internal/cmd/controller/clustermonitor/monitor_test.go @@ -42,13 +42,17 @@ func (m BDStatusMatcher) Matches(x interface{}) bool { if cond.Type == "Ready" { foundReady = true - if !strings.Contains(cond.Message, "offline") { + if cond.Status != "False" || + !strings.Contains(cond.Reason, "offline") || + !strings.Contains(cond.Message, "offline") { return false } } else if cond.Type == "Monitored" { foundMonitored = true - if !strings.Contains(cond.Message, "offline") { + if cond.Status != "False" || + !strings.Contains(cond.Reason, "offline") || + !strings.Contains(cond.Message, "offline") { return false } } From afb19ebe10e43c96140d7a66bb944b3e5e9f9544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Mon, 14 Oct 2024 10:47:40 +0200 Subject: [PATCH 07/14] Prevent agent check-in interval from being 0 This ensures that creating a new agent bundle fails with an agent check-in interval set to 0. 
--- .../controllers/manageagent/manageagent.go | 4 ++++ .../manageagent/manageagent_test.go | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent.go b/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent.go index f180761844..857ebb3ed0 100644 --- a/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent.go +++ b/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent.go @@ -256,6 +256,10 @@ func (h *handler) newAgentBundle(ns string, cluster *fleet.Cluster) (runtime.Obj agentNamespace = cluster.Spec.AgentNamespace } + if cfg.AgentCheckinInterval.Seconds() == 0 { + return nil, fmt.Errorf("agent check-in interval cannot be 0") + } + // Notice we only set the agentScope when it's a non-default agentNamespace. This is for backwards compatibility // for when we didn't have agent scope before objs := agent.Manifest( diff --git a/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent_test.go b/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent_test.go index 946f9ec47c..b3bee2d814 100644 --- a/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent_test.go +++ b/internal/cmd/controller/agentmanagement/controllers/manageagent/manageagent_test.go @@ -1,17 +1,37 @@ package manageagent import ( + "strings" "testing" + "time" "github.com/rancher/wrangler/v3/pkg/generic/fake" "go.uber.org/mock/gomock" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + "github.com/rancher/fleet/internal/config" fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" ) +func TestNewAgentBundle(t *testing.T) { + config.Set(&config.Config{AgentCheckinInterval: metav1.Duration{Duration: 0 * time.Second}}) + + h := handler{systemNamespace: "blah"} + obj, err := h.newAgentBundle("foo", 
&fleet.Cluster{Spec: fleet.ClusterSpec{AgentNamespace: "bar"}}) + + if obj != nil { + t.Fatalf("expected obj returned by newAgentBundle to be nil") + } + + expectedStr := "interval cannot be 0" + if !strings.Contains(err.Error(), expectedStr) { + t.Fatalf("expected error returned by newAgentBundle to contain %q", expectedStr) + } +} + func TestOnClusterChangeAffinity(t *testing.T) { ctrl := gomock.NewController(t) namespaces := fake.NewMockNonNamespacedControllerInterface[*corev1.Namespace, *corev1.NamespaceList](ctrl) From abe40a7c2e2b65758db41a0a6dc6162e016f4a78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Mon, 14 Oct 2024 11:08:12 +0200 Subject: [PATCH 08/14] Prevent check-in interval from being 0 when importing cluster This adds a check on the agent check-in interval to cluster import, for consistency with agent bundle updates. --- .../controller/agentmanagement/controllers/cluster/import.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/cmd/controller/agentmanagement/controllers/cluster/import.go b/internal/cmd/controller/agentmanagement/controllers/cluster/import.go index 9541683ebe..c7c13a97ae 100644 --- a/internal/cmd/controller/agentmanagement/controllers/cluster/import.go +++ b/internal/cmd/controller/agentmanagement/controllers/cluster/import.go @@ -236,6 +236,10 @@ func (i *importHandler) importCluster(cluster *fleet.Cluster, status fleet.Clust apiServerCA = secret.Data[config.APIServerCAKey] ) + if cfg.AgentCheckinInterval.Seconds() == 0 { + return status, fmt.Errorf("agent check-in interval cannot be 0") + } + if apiServerURL == "" { if len(cfg.APIServerURL) == 0 { return status, fmt.Errorf("missing apiServerURL in fleet config for cluster auto registration") From 946ab874ee244e9d89cd1791e72ecdec8a53b4ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Mon, 14 Oct 2024 11:31:38 +0200 Subject: [PATCH 09/14] Make cluster monitor interval and threshold configurable This enables users to 
determine how often the Fleet controller will check for offline clusters, and based on which threshold. If the configured threshold is below the triple of the check-in interval, that tripled value will be used instead. --- charts/fleet/templates/configmap.yaml | 2 ++ charts/fleet/values.yaml | 9 +++++++++ internal/cmd/controller/clustermonitor/monitor.go | 14 ++++++++------ internal/cmd/controller/operator.go | 12 +++++++++++- internal/config/config.go | 9 +++++++++ 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/charts/fleet/templates/configmap.yaml b/charts/fleet/templates/configmap.yaml index 8f0a5aff33..875c1505e3 100644 --- a/charts/fleet/templates/configmap.yaml +++ b/charts/fleet/templates/configmap.yaml @@ -16,6 +16,8 @@ data: "bundledeployment": "{{.Values.agent.reconciler.workers.bundledeployment}}", "drift": "{{.Values.agent.reconciler.workers.drift}}" }, + "clusterMonitorInterval": "{{.Values.clusterMonitorInterval}}", + "clusterMonitorThreshold": "{{.Values.clusterMonitorThreshold}}", {{ if .Values.garbageCollectionInterval }} "garbageCollectionInterval": "{{.Values.garbageCollectionInterval}}", {{ end }} diff --git a/charts/fleet/values.yaml b/charts/fleet/values.yaml index 7094877a47..4a1e1c2e0e 100644 --- a/charts/fleet/values.yaml +++ b/charts/fleet/values.yaml @@ -23,6 +23,15 @@ agentTLSMode: "system-store" # A duration string for how often agents should report a heartbeat agentCheckinInterval: "15m" +# Determines how long must have elapsed since a downstream cluster's Fleet agent last reported its status to the +# management cluster, before that downstream cluster is considered offline. +# If this configured value is shorter than three times the agent check-in interval, then that check-in +# interval-based value will be used instead to prevent false positives. +clusterMonitorThreshold: "45m" + +# Determines how often the cluster monitor will check for offline downstream clusters. 
+clusterMonitorInterval: "10m" + # The amount of time that agents will wait before they clean up old Helm releases. # A non-existent value or 0 will result in an interval of 15 minutes. garbageCollectionInterval: "15m" diff --git a/internal/cmd/controller/clustermonitor/monitor.go b/internal/cmd/controller/clustermonitor/monitor.go index 628c61c416..d500105915 100644 --- a/internal/cmd/controller/clustermonitor/monitor.go +++ b/internal/cmd/controller/clustermonitor/monitor.go @@ -3,6 +3,7 @@ package clustermonitor import ( "context" "fmt" + "math" "time" "k8s.io/apimachinery/pkg/types" @@ -19,27 +20,28 @@ import ( // a certain threshold, then Run updates statuses of all bundle deployments targeting that cluster, to reflect the fact // that the cluster is offline. This prevents those bundle deployments from displaying outdated status information. // -// The threshold is computed based on the configured agent check-in interval, plus a 10 percent margin. +// A cluster will be considered offline if its Fleet agent has not reported its status for more than: +// - three times the agent check-in interval +// - or any larger configured interval. // Therefore, this function requires configuration to have been loaded into the config package using `Load` before // running. // // Bundle deployment status updates done here are unlikely to conflict with those done by the bundle deployment // reconciler, which are either run from an online target cluster (from its Fleet agent) or triggered by other status // updates such as this one (eg. bundle deployment reconciler living in the Fleet controller). -func Run(ctx context.Context, c client.Client) { +func Run(ctx context.Context, c client.Client, interval, threshold time.Duration) { for { select { case <-ctx.Done(): return - case <-time.After(time.Minute): // XXX: should this be configurable? 
+ case <-time.After(interval): } cfg := config.Get() // This enables config changes to take effect - // Add a 10% margin, which is arbitrary but should reduce the risk of false positives. - threshold := time.Duration(cfg.AgentCheckinInterval.Seconds() * 1.1) + thresholdSecs := math.Max(cfg.AgentCheckinInterval.Seconds()*3, threshold.Seconds()) - UpdateOfflineBundleDeployments(ctx, c, threshold) + UpdateOfflineBundleDeployments(ctx, c, time.Second*time.Duration(thresholdSecs)) } } diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go index 66ba3b234e..e20f31439d 100644 --- a/internal/cmd/controller/operator.go +++ b/internal/cmd/controller/operator.go @@ -2,6 +2,7 @@ package controller import ( "context" + "errors" "fmt" "github.com/reugn/go-quartz/quartz" @@ -10,6 +11,7 @@ import ( "github.com/rancher/fleet/internal/cmd/controller/clustermonitor" "github.com/rancher/fleet/internal/cmd/controller/reconciler" "github.com/rancher/fleet/internal/cmd/controller/target" + fleetcfg "github.com/rancher/fleet/internal/config" "github.com/rancher/fleet/internal/manifest" "github.com/rancher/fleet/internal/metrics" "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" @@ -177,7 +179,15 @@ func start( } setupLog.Info("starting cluster status monitor") - go clustermonitor.Run(ctx, mgr.GetClient()) + cfg := fleetcfg.Get() + // No need to run a similar check on the threshold, since its minimum value will be a multiple of the agent check-in + // interval anyway. 
+	if cfg.ClusterMonitorInterval.Seconds() == 0 {
+		err := errors.New("cluster status monitor interval cannot be 0")
+		setupLog.Error(err, "cannot start cluster status monitor")
+		return err
+	}
+	go clustermonitor.Run(ctx, mgr.GetClient(), cfg.ClusterMonitorInterval.Duration, cfg.ClusterMonitorThreshold.Duration)
 
 	setupLog.Info("starting job scheduler")
 	jobCtx, cancel := context.WithCancel(ctx)
diff --git a/internal/config/config.go b/internal/config/config.go
index 6187a22e41..2d3e47b509 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -121,6 +121,15 @@ type Config struct {
 
 	// AgentWorkers specifies the maximum number of workers for each agent reconciler.
 	AgentWorkers AgentWorkers `json:"agentWorkers,omitempty"`
+
+	// ClusterMonitorInterval determines how often the cluster monitor will check for offline downstream clusters.
+	ClusterMonitorInterval metav1.Duration `json:"clusterMonitorInterval.omitempty"`
+
+	// ClusterMonitorThreshold determines how long must have elapsed since a downstream cluster's Fleet agent last
+	// reported its status to the management cluster, before that downstream cluster is considered offline.
+	// If this configured value is shorter than three times the agent check-in interval, then that check-in
+	// interval-based value will be used instead to prevent false positives.
+	ClusterMonitorThreshold metav1.Duration `json:"clusterMonitorThreshold,omitempty"`
 }
 
 type AgentWorkers struct {

From d2e32633a1a624f63edd81ae19ffa18c928bf274 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Corentin=20N=C3=A9au?=
Date: Fri, 18 Oct 2024 12:57:21 +0200
Subject: [PATCH 10/14] Skip bundle deployment updates for already offline
 clusters

This optimises updates to bundle deployments, running them only against
clusters which bundle deployments are not yet marked as offline.
--- .../cmd/controller/clustermonitor/monitor.go | 29 +++- .../controller/clustermonitor/monitor_test.go | 155 +++++++++++++++++- 2 files changed, 174 insertions(+), 10 deletions(-) diff --git a/internal/cmd/controller/clustermonitor/monitor.go b/internal/cmd/controller/clustermonitor/monitor.go index d500105915..26cfd54cde 100644 --- a/internal/cmd/controller/clustermonitor/monitor.go +++ b/internal/cmd/controller/clustermonitor/monitor.go @@ -2,7 +2,7 @@ package clustermonitor import ( "context" - "fmt" + "errors" "math" "time" @@ -16,6 +16,8 @@ import ( "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" ) +const offlineMsg = "cluster is offline" + // Run monitors Fleet cluster resources' agent last seen dates. If a cluster's agent was last seen longer ago than // a certain threshold, then Run updates statuses of all bundle deployments targeting that cluster, to reflect the fact // that the cluster is offline. This prevents those bundle deployments from displaying outdated status information. @@ -45,6 +47,9 @@ func Run(ctx context.Context, c client.Client, interval, threshold time.Duration } } +// UpdateOfflineBundleDeployments looks for offline clusters based on the provided threshold duration. For each cluster +// considered offline, this updates its bundle deployments' statuses accordingly. +// If a cluster's bundle deployments have already been marked as offline, they will be skipped. func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, threshold time.Duration) { logger := ctrl.Log.WithName("cluster status monitor") @@ -59,10 +64,6 @@ func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, thresh logger.Info("Checking cluster status", "cluster", cluster.Name, "last seen", lastSeen.UTC().String()) - // XXX: do we want to run this more than once per cluster, updating the timestamp each time? 
- // Or would it make sense to keep the oldest possible timestamp in place, for users to know since when the - // cluster is offline? - // lastSeen being 0 would typically mean that the cluster is not registered yet, in which case bundle // deployments should not be deployed there. if lastSeen.IsZero() || time.Now().UTC().Sub(lastSeen.UTC()) < threshold { @@ -85,7 +86,19 @@ func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, thresh continue } + bd_update: for _, bd := range bundleDeployments.Items { + for _, cond := range bd.Status.Conditions { + switch cond.Type { + case "Ready": + fallthrough + case "Monitored": + if cond.Message == offlineMsg { + break bd_update + } + } + } + logger.Info("Updating bundle deployment in offline cluster", "cluster", cluster.Name, "bundledeployment", bd.Name) err := retry.RetryOnConflict(retry.DefaultRetry, func() error { t := &v1alpha1.BundleDeployment{} @@ -101,16 +114,16 @@ func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, thresh for _, cond := range bd.Status.Conditions { switch cond.Type { - // XXX: which messages do we want to set and where? case "Ready": // FIXME: avoid relying on agent pkg for this? mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady) - mc.SetError(&t.Status, "Cluster offline", fmt.Errorf("cluster is offline")) + mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg)) + mc.Unknown(&t.Status) // XXX: do we want to set Deployed and Installed conditions as well? // XXX: should we set conditions to `Unknown`? 
case "Monitored": mc := monitor.Cond(v1alpha1.BundleDeploymentConditionMonitored) - mc.SetError(&t.Status, "Cluster offline", fmt.Errorf("cluster is offline")) + mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg)) } } diff --git a/internal/cmd/controller/clustermonitor/monitor_test.go b/internal/cmd/controller/clustermonitor/monitor_test.go index 36b54937d7..5a2106d865 100644 --- a/internal/cmd/controller/clustermonitor/monitor_test.go +++ b/internal/cmd/controller/clustermonitor/monitor_test.go @@ -67,8 +67,9 @@ func (m BDStatusMatcher) String() string { } type clusterWithOfflineMarker struct { - cluster v1alpha1.Cluster - isOffline bool + cluster v1alpha1.Cluster + isOffline bool + isAlreadyMarkedOffline bool } func Test_Run(t *testing.T) { //nolint: funlen // this is a test function, its length should not be an issue @@ -309,6 +310,150 @@ func Test_Run(t *testing.T) { //nolint: funlen // this is a test function, its l }, listBDErr: nil, }, + { + name: "multiple offline clusters, some of them already marked offline", + clusters: []clusterWithOfflineMarker{ + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-10 * threshold)}, + }, + Namespace: "clusterns1", + }, + }, + isOffline: true, + }, + { + cluster: v1alpha1.Cluster{ + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC()}, + }, + }, + }, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster2", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-5 * threshold)}, + }, + Namespace: "clusterns2", + }, + }, + isOffline: 
true, + isAlreadyMarkedOffline: true, + }, + { + cluster: v1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mycluster3", + }, + Status: v1alpha1.ClusterStatus{ + Agent: v1alpha1.AgentStatus{ + LastSeen: metav1.Time{Time: time.Now().UTC().Add(-3 * threshold)}, + }, + Namespace: "clusterns3", + }, + }, + isOffline: true, + }, + }, + listClustersErr: nil, + bundleDeployments: map[string][]v1alpha1.BundleDeployment{ + "clusterns1": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns1", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + "clusterns2": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns2", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + Message: "cluster is offline", + }, + { + Type: "Monitored", + Message: "cluster is offline", + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "my-other-bundledeployment", + Namespace: "clusterns2", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + Message: "cluster is offline", + }, + { + Type: "Monitored", + Message: "cluster is offline", + }, + }, + }, + }, + }, + "clusterns3": { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mybundledeployment", + Namespace: "clusterns3", + }, + Status: v1alpha1.BundleDeploymentStatus{ + Conditions: []genericcondition.GenericCondition{ + { + Type: "Ready", + }, + { + Type: "Monitored", + }, + }, + }, + }, + }, + }, + listBDErr: nil, + }, } for _, tc := range cases { @@ -343,6 +488,12 @@ func Test_Run(t *testing.T) { //nolint: funlen // this is a test function, its l }, ) + if cl.isAlreadyMarkedOffline { + // skip status updates for bundle deployments which have already been + // marked offline by a previous 
monitoring loop. + continue + } + for _, bd := range bundleDeplsForCluster { testClient.EXPECT().Get(ctx, gomock.Any(), gomock.Any()).DoAndReturn( func( From 1f98f3910ebf6d54847c577ce7ac7fefc52f0f8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Fri, 18 Oct 2024 13:12:15 +0200 Subject: [PATCH 11/14] Set Ready condition to Unknown for offline clusters This better reflects what is then known about workloads running in such clusters than `False`. --- internal/cmd/agent/deployer/monitor/condition.go | 8 ++++++++ internal/cmd/controller/clustermonitor/monitor.go | 2 -- internal/cmd/controller/clustermonitor/monitor_test.go | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/internal/cmd/agent/deployer/monitor/condition.go b/internal/cmd/agent/deployer/monitor/condition.go index 87c3e69ef2..e085a89b7c 100644 --- a/internal/cmd/agent/deployer/monitor/condition.go +++ b/internal/cmd/agent/deployer/monitor/condition.go @@ -44,6 +44,14 @@ func (c Cond) IsFalse(obj interface{}) bool { return getStatus(obj, string(c)) == "False" } +func (c Cond) Unknown(obj interface{}) { + setStatus(obj, string(c), "Unknown") +} + +func (c Cond) IsUnknown(obj interface{}) bool { + return getStatus(obj, string(c)) == "Unknown" +} + func (c Cond) Reason(obj interface{}, reason string) { cond := findOrCreateCond(obj, string(c)) getFieldValue(cond, "Reason").SetString(reason) diff --git a/internal/cmd/controller/clustermonitor/monitor.go b/internal/cmd/controller/clustermonitor/monitor.go index 26cfd54cde..86b8b15cc2 100644 --- a/internal/cmd/controller/clustermonitor/monitor.go +++ b/internal/cmd/controller/clustermonitor/monitor.go @@ -115,12 +115,10 @@ func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, thresh for _, cond := range bd.Status.Conditions { switch cond.Type { case "Ready": - // FIXME: avoid relying on agent pkg for this? 
mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady) mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg)) mc.Unknown(&t.Status) // XXX: do we want to set Deployed and Installed conditions as well? - // XXX: should we set conditions to `Unknown`? case "Monitored": mc := monitor.Cond(v1alpha1.BundleDeploymentConditionMonitored) mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg)) diff --git a/internal/cmd/controller/clustermonitor/monitor_test.go b/internal/cmd/controller/clustermonitor/monitor_test.go index 5a2106d865..adb3bd412f 100644 --- a/internal/cmd/controller/clustermonitor/monitor_test.go +++ b/internal/cmd/controller/clustermonitor/monitor_test.go @@ -42,7 +42,7 @@ func (m BDStatusMatcher) Matches(x interface{}) bool { if cond.Type == "Ready" { foundReady = true - if cond.Status != "False" || + if cond.Status != "Unknown" || !strings.Contains(cond.Reason, "offline") || !strings.Contains(cond.Message, "offline") { return false From fdae320ecb037055ce3519211e12c2ceee19ea80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Fri, 18 Oct 2024 16:50:22 +0200 Subject: [PATCH 12/14] Fix json attribute for cluster monitor interval This should fix Fleet controller deployments complaining about the interval being 0 when it should never be. --- internal/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/config/config.go b/internal/config/config.go index 2d3e47b509..e7cd672aa8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -123,7 +123,7 @@ type Config struct { AgentWorkers AgentWorkers `json:"agentWorkers,omitempty"` // ClusterMonitorInterval determines how often the cluster monitor will check for offline downstream clusters. 
- ClusterMonitorInterval metav1.Duration `json:"clusterMonitorInterval.omitempty"` + ClusterMonitorInterval metav1.Duration `json:"clusterMonitorInterval,omitempty"` // ClusterMonitorThreshold determines how long must have elapsed since a downstream cluster's Fleet agent last // reported its status to the management cluster, before that downstream cluster is considered offline. From 2aaefd86b008de729e36d9dd015f2587976b6e8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Fri, 18 Oct 2024 18:38:17 +0200 Subject: [PATCH 13/14] Run cluster status monitor on unsharded controller only Running one cluster status monitor per Fleet controller pod is not necessary and may cause conflicts in sharded setups. --- internal/cmd/controller/operator.go | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go index e20f31439d..084db3ee98 100644 --- a/internal/cmd/controller/operator.go +++ b/internal/cmd/controller/operator.go @@ -178,16 +178,18 @@ func start( return err } - setupLog.Info("starting cluster status monitor") - cfg := fleetcfg.Get() - // No need to run a similar check on the threshold, since its minimum value will be a multiple of the agent check-in - // interval anyway. - if cfg.ClusterMonitorInterval.Seconds() == 0 { - err := errors.New("cluster status monitor interval cannot be 0") - setupLog.Error(err, "cannot start cluster status monitor") - return err + if shardID == "" { // only one instance of the cluster status monitor needs to run. + setupLog.Info("starting cluster status monitor") + cfg := fleetcfg.Get() + // No need to run a similar check on the threshold, since its minimum value will be a multiple of the agent check-in + // interval anyway. 
+ if cfg.ClusterMonitorInterval.Seconds() == 0 { + err := errors.New("cluster status monitor interval cannot be 0") + setupLog.Error(err, "cannot start cluster status monitor") + return err + } + go clustermonitor.Run(ctx, mgr.GetClient(), cfg.ClusterMonitorInterval.Duration, cfg.ClusterMonitorThreshold.Duration) } - go clustermonitor.Run(ctx, mgr.GetClient(), cfg.ClusterMonitorInterval.Duration, cfg.ClusterMonitorThreshold.Duration) setupLog.Info("starting job scheduler") jobCtx, cancel := context.WithCancel(ctx) From 9f8db1c9df5bcf40ba3c8033dae2ac48da6257b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Mon, 21 Oct 2024 11:43:02 +0200 Subject: [PATCH 14/14] Add agent check-in interval to agent install tests Omitting the agent check-in interval when patching the `fleet-controller` config map would now lead to errors when setting up agents with a check-in interval bearing the default value for a duration, ie 0s. That interval is now set with a hard-coded value, which is of no importance for such tests, for the sake of not being zero. --- e2e/multi-cluster/installation/agent_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/e2e/multi-cluster/installation/agent_test.go b/e2e/multi-cluster/installation/agent_test.go index 5bea51fdd2..d9aca41365 100644 --- a/e2e/multi-cluster/installation/agent_test.go +++ b/e2e/multi-cluster/installation/agent_test.go @@ -27,8 +27,9 @@ var _ = Describe("Fleet installation with TLS agent modes", func() { "cattle-fleet-system", "--type=merge", "-p", + // Agent check-in interval cannot be 0. Any other value will do here. fmt.Sprintf( - `{"data":{"config":"{\"apiServerURL\": \"https://google.com\", \"apiServerCA\": \"\", \"agentTLSMode\": \"%s\"}"}}`, + `{"data":{"config":"{\"apiServerURL\": \"https://google.com\", \"apiServerCA\": \"\", \"agentTLSMode\": \"%s\", \"agentCheckinInterval\": \"1m\"}"}}`, agentMode, ), )