remove safeToStartCycle check #91

Merged
2 changes: 1 addition & 1 deletion .github/workflows/golangci-lint.yml
@@ -16,5 +16,5 @@ jobs:
      - name: golangci-lint
        uses: golangci/golangci-lint-action@v6
        with:
-         version: v1.59
+         version: v1.63
          args: --timeout=5m
24 changes: 10 additions & 14 deletions cmd/observer/main.go
@@ -37,7 +37,6 @@ type app struct {
    cloudProviderName *string
    namespace         *string
    addr              *string
-   prometheusAddress *string
    dryMode           *bool
    runImmediately    *bool
    runOnce           *bool
@@ -50,18 +49,16 @@ type app struct {
// newApp creates a new app and sets up the cobra flags
func newApp(rootCmd *cobra.Command) *app {
    return &app{
-       addr:                     rootCmd.PersistentFlags().String("addr", ":8080", "Address to listen on for /metrics"),
-       cloudProviderName:        rootCmd.PersistentFlags().String("cloud-provider", "aws", "Which cloud provider to use, options: [aws]"),
-       namespaces:               rootCmd.PersistentFlags().StringSlice("namespaces", []string{"kube-system"}, "Namespaces to watch for cycle request objects"),
-       namespace:                rootCmd.PersistentFlags().String("namespace", "kube-system", "Namespaces to watch and create cnrs"),
-       dryMode:                  rootCmd.PersistentFlags().Bool("dry", false, "api-server drymode for applying CNRs"),
-       waitInterval:             rootCmd.PersistentFlags().Duration("wait-interval", 2*time.Minute, "duration to wait after detecting changes before creating CNR objects. The window for letting changes on nodegroups settle before starting rotation"),
-       checkInterval:            rootCmd.PersistentFlags().Duration("check-interval", 5*time.Minute, `duration interval to check for changes. e.g. run the loop every 5 minutes"`),
-       nodeStartupTime:          rootCmd.PersistentFlags().Duration("node-startup-time", 2*time.Minute, "duration to wait after a cluster-autoscaler scaleUp event is detected"),
-       runImmediately:           rootCmd.PersistentFlags().Bool("now", false, "makes the check loop run straight away on program start rather than wait for the check interval to elapse"),
-       runOnce:                  rootCmd.PersistentFlags().Bool("once", false, "run the check loop once then exit. also works with --now"),
-       prometheusAddress:        rootCmd.PersistentFlags().String("prometheus-address", "prometheus", "Prometheus service address used to query cluster-autoscaler metrics"),
Collaborator:

discuss: is it worth allowing --prometheus-address to remain as a flag but be unused, in order to avoid breaking people's installations when they automatically bump the version? Or is it better for the version bump to immediately show that this feature isn't used any more, with this error message stopping it from starting?

docker run --rm -it ghcr.io/atlassian-labs/cyclops:v1.10.1 cyclops --prometheus-address 127.0.0.1:8080
cyclops: error: unknown long flag '--prometheus-address', try --help

Collaborator Author:

I think I prefer to remove the flag immediately, because having a flag that does nothing can be confusing. Leaving it would also require us to remember to remove it later. Since this is already a breaking change, I think it's better to alert users to the change in behaviour right away with the flag removal, rather than silently ignoring it.
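
For reference, the alternative raised above (keeping --prometheus-address as an inert no-op so upgrades don't fail on an unknown flag) could be sketched with cobra/pflag roughly as follows. This is a hypothetical illustration, not what the PR does:

    // Hypothetical alternative, not adopted by this PR: re-register the old
    // flag so startup still succeeds, and mark it deprecated so users are warned.
    _ = rootCmd.PersistentFlags().String("prometheus-address", "prometheus",
        "DEPRECATED: ignored now that the safeToStartCycle check is removed")
    _ = rootCmd.PersistentFlags().MarkDeprecated("prometheus-address",
        "it is ignored and will be removed in a future release")

pflag's MarkDeprecated hides the flag from help output and prints the deprecation message whenever the flag is used, which would have traded a softer migration path for carrying dead code.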

-       prometheusScrapeInterval: rootCmd.PersistentFlags().Duration("prometheus-scrape-interval", 40*time.Second, "Prometheus scrape interval used to detect change of value from prometheus query, needed to detect scaleUp event"),
+       addr:              rootCmd.PersistentFlags().String("addr", ":8080", "Address to listen on for /metrics"),
+       cloudProviderName: rootCmd.PersistentFlags().String("cloud-provider", "aws", "Which cloud provider to use, options: [aws]"),
+       namespaces:        rootCmd.PersistentFlags().StringSlice("namespaces", []string{"kube-system"}, "Namespaces to watch for cycle request objects"),
+       namespace:         rootCmd.PersistentFlags().String("namespace", "kube-system", "Namespaces to watch and create cnrs"),
+       dryMode:           rootCmd.PersistentFlags().Bool("dry", false, "api-server drymode for applying CNRs"),
+       waitInterval:      rootCmd.PersistentFlags().Duration("wait-interval", 2*time.Minute, "duration to wait after detecting changes before creating CNR objects. The window for letting changes on nodegroups settle before starting rotation"),
+       checkInterval:     rootCmd.PersistentFlags().Duration("check-interval", 5*time.Minute, `duration interval to check for changes. e.g. run the loop every 5 minutes"`),
+       nodeStartupTime:   rootCmd.PersistentFlags().Duration("node-startup-time", 2*time.Minute, "duration to wait after a cluster-autoscaler scaleUp event is detected"),
+       runImmediately:    rootCmd.PersistentFlags().Bool("now", false, "makes the check loop run straight away on program start rather than wait for the check interval to elapse"),
+       runOnce:           rootCmd.PersistentFlags().Bool("once", false, "run the check loop once then exit. also works with --now"),
    }
}

@@ -126,7 +123,6 @@ func (a *app) run() {
        RunOnce:                  *a.runOnce,
        WaitInterval:             *a.waitInterval,
        NodeStartupTime:          *a.nodeStartupTime,
-       PrometheusAddress:        *a.prometheusAddress,
        PrometheusScrapeInterval: *a.prometheusScrapeInterval,
    }

@@ -48,7 +48,7 @@ func (t *CycleNodeRequestTransitioner) transitionUndefined() (reconcile.Result,
    validationErrors := validation.IsDNS1035Label(t.cycleNodeRequest.Name)

    if len(validationErrors) > 0 {
-       return t.transitionToFailed(fmt.Errorf(strings.Join(validationErrors, ",")))
+       return t.transitionToFailed(fmt.Errorf("%s", strings.Join(validationErrors, ",")))
    }

    // Transition the object to pending
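This change, along with the identical one in the cyclenodestatus transitioner below, passes the joined error text as a value instead of using it as the format string. That satisfies go vet's printf check for non-constant format strings (plausibly what the golangci-lint v1.59 to v1.63 bump above started flagging), and it also keeps a stray '%' in an error message from being misread as a formatting verb. A minimal standalone illustration:

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        errs := []string{"invalid value: 100%"}
        // Old form: the joined string becomes the format string, so the
        // trailing '%' is parsed as an incomplete verb.
        fmt.Println(fmt.Errorf(strings.Join(errs, ","))) // invalid value: 100%!(NOVERB)
        // New form: the string is plain data and prints verbatim.
        fmt.Println(fmt.Errorf("%s", strings.Join(errs, ","))) // invalid value: 100%
    }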
2 changes: 1 addition & 1 deletion pkg/controller/cyclenodestatus/transitioner/transitions.go
@@ -153,7 +153,7 @@ func (t *CycleNodeStatusTransitioner) transitionDraining() (reconcile.Result, er
    // Fail with all of the combined encountered errors if we got any. If we failed inside the loop we would
    // potentially miss some important information in the logs.
    if len(unexpectedErrors) > 0 {
-       return t.transitionToFailed(fmt.Errorf(strings.Join(unexpectedErrors, "\n")))
+       return t.transitionToFailed(fmt.Errorf("%s", strings.Join(unexpectedErrors, "\n")))
    }
    // No serious errors were encountered. If we're done, move on.
    if finished {
91 changes: 0 additions & 91 deletions pkg/observer/controller.go
@@ -2,24 +2,18 @@ package observer

import (
    "context"
-   "errors"
    "net/http"
    "sort"
-   "strconv"
    "time"

-   "github.com/cenkalti/backoff/v4"
-   "github.com/prometheus/client_golang/api"
    "github.com/prometheus/client_golang/prometheus/promhttp"
-   "github.com/prometheus/common/model"
    corev1 "k8s.io/api/core/v1"
    "k8s.io/klog/v2"
    "sigs.k8s.io/controller-runtime/pkg/client"

    v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1"
    "github.com/atlassian-labs/cyclops/pkg/generation"
    "github.com/atlassian-labs/cyclops/pkg/k8s"
-   promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

var apiVersion = "undefined" //nolint:golint,varcheck,deadcode,unused
@@ -277,66 +271,6 @@ func (c *controller) dropInProgressNodeGroups(nodeGroups v1.NodeGroupList, cnrs
    return restingNodeGroups
}

-// get the cluster-autoscaler last scaleUp activity time
-func stringToTime(s string) (time.Time, error) {
-   sec, err := strconv.ParseInt(s, 10, 64)
-   if err != nil {
-       return time.Time{}, err
-   }
-   return time.Unix(sec, 0), nil
-}
-
-// query cluster-autoscaler metrics to figure out if it's safe to start a new CNR
-func (c *controller) safeToStartCycle() bool {
-   client, err := api.NewClient(api.Config{
-       Address: c.PrometheusAddress,
-   })
-   if err != nil {
-       // Prometheus might not be installed in the cluster. return true if it can't connect
-       klog.Errorln("Error creating client:", err)
-       return true
-   }
-
-   v1api := promv1.NewAPI(client)
-   ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
-   defer cancel()
-   // scaleDown metric is updated every cycle cluster-autoscaler is checking if the cluster should scaleDown
-   // scaleDown does not get checked and therefore not updated when the cluster is scaling up since no check for scaleDown is needed
-   result, warnings, err := v1api.Query(ctx, "cluster_autoscaler_last_activity{activity='scaleDown'}", time.Now())
-   if err != nil {
-       // cluster-autoscaler might not be installed in the cluster. return true if it can't find the metrics or run the query
-       klog.Errorln("Error querying Prometheus:", err)
-       return true
-   }
-   if len(warnings) > 0 {
-       klog.Errorln("Warnings:", warnings)
-   }
-
-   v := result.(model.Vector)
-   // cluster-autoscaler should always give a response if it's active
-   if v.Len() == 0 {
-       klog.Errorln("Empty response from prometheus")
-       return true
-   }
-
-   scaleUpTime := v[v.Len()-1].Value.String()
-   t, err := stringToTime(scaleUpTime)
-   if err != nil {
-       klog.Errorln("Error converting the time:", err)
-       return false
-   }
-
-   // cluster_autoscaler_last_activity values will update every PrometheusScrapeInterval in non-scaling scenario
-   lastScaleEvent := time.Since(t)
-   if lastScaleEvent > c.PrometheusScrapeInterval {
-       klog.Infoln("Scale up event recently happened")
-       return false
-   }
-   klog.V(3).Infoln("No scale up event")
-
-   return true
-}

// createCNRs generates and applies CNRs from the changedNodeGroups
func (c *controller) createCNRs(changedNodeGroups []*ListedNodeGroups) {
    klog.V(3).Infoln("applying")
@@ -371,26 +305,6 @@ func (c *controller) nextRunTime() time.Time {
    return time.Now().UTC().Add(c.CheckInterval)
}

-func (c *controller) checkIfSafeToStartCycle() bool {
-   b := backoff.NewExponentialBackOff()
-   b.MaxElapsedTime = 120 * time.Second
-
-   err := backoff.Retry(func() error {
-       if !c.safeToStartCycle() {
-           klog.Error("Cluster autoscaler scaleUp event in progress. Retry...")
-           return errors.New("cluster-autoscaler event in progress")
-       }
-       return nil
-   }, b)
-
-   if err != nil {
-       klog.Errorln("there are still cluster-autoscaler scaleUp events")
-       return false
-   }
-
-   return true
-}

// Run runs the controller loops once. detecting lock, changes, and applying CNRs
// implements cron.Job interface
func (c *controller) Run() {
@@ -420,11 +334,6 @@ func (c *controller) Run() {
        }
    }

-   // query cluster-autoscaler to check if it's safe to start a new CNR
-   if !c.checkIfSafeToStartCycle() {
-       return
-   }

    // wait for the desired amount to allow any in progress changes to batch up
    klog.V(3).Infof("waiting for %v to allow changes to settle", c.WaitInterval)
    select {
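With the safeToStartCycle check removed, the observer no longer consults cluster-autoscaler metrics before cycling; the remaining settle-time safeguards are the existing --wait-interval and --node-startup-time flags shown in cmd/observer/main.go above. An installation that leaned on the Prometheus check could compensate with longer windows, for example (the values here are illustrative only):

    cyclops --wait-interval=10m --node-startup-time=5m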
7 changes: 3 additions & 4 deletions pkg/observer/types.go
@@ -21,10 +21,9 @@ type Controller interface {

// Options contains the options config for a controller
type Options struct {
-   CNRPrefix         string
-   Namespace         string
-   CheckSchedule     string
-   PrometheusAddress string
+   CNRPrefix     string
+   Namespace     string
+   CheckSchedule string

    DryMode        bool
    RunImmediately bool