add a drain timeout #351
```diff
@@ -39,6 +39,7 @@ var (

 	// Command line flags
 	period         time.Duration
+	drainTimeout   time.Duration
 	dsNamespace    string
 	dsName         string
 	lockAnnotation string
```
@@ -56,6 +57,7 @@ var ( | |
messageTemplateReboot string | ||
podSelectors []string | ||
rebootCommand string | ||
isDrained bool | ||
|
||
rebootDays []string | ||
rebootStart string | ||
|
```diff
@@ -93,6 +95,8 @@ func main() {
 	rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60,
 		"sentinel check period")
+	rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0,
+		"drain timeout")
 	rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system",
 		"namespace containing daemonset on which to place lock")
 	rootCmd.PersistentFlags().StringVar(&dsName, "ds-name", "kured",
```
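Since the new flag is registered with `DurationVar`, its value is parsed with Go's `time.ParseDuration`; the default of `0` leaves the drainer's `Timeout` at its zero value, which kubectl's drain helper treats as no limit (matching the startup log added further down). A minimal sketch of the accepted formats, using only the standard library:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// DurationVar flags parse with time.ParseDuration, so any of these
	// work as --drain-timeout values; 0 (the default) means no timeout.
	for _, v := range []string{"90s", "5m", "1h30m"} {
		d, err := time.ParseDuration(v)
		if err != nil {
			fmt.Println("invalid duration:", err)
			continue
		}
		fmt.Printf("--drain-timeout=%s parses to %v\n", v, d)
	}
}
```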
```diff
@@ -315,7 +319,7 @@ func release(lock *daemonsetlock.DaemonSetLock) {
 	}
 }

-func drain(client *kubernetes.Clientset, node *v1.Node) {
+func drain(client *kubernetes.Clientset, node *v1.Node) bool {
 	nodename := node.GetName()

 	log.Infof("Draining node %s", nodename)
```
```diff
@@ -339,14 +343,17 @@ func drain(client *kubernetes.Clientset, node *v1.Node) {
 		IgnoreAllDaemonSets: true,
 		ErrOut:              os.Stderr,
 		Out:                 os.Stdout,
+		Timeout:             drainTimeout,
 	}
 	if err := kubectldrain.RunCordonOrUncordon(drainer, node, true); err != nil {
 		log.Fatalf("Error cordonning %s: %v", nodename, err)
 	}

 	if err := kubectldrain.RunNodeDrain(drainer, nodename); err != nil {
-		log.Fatalf("Error draining %s: %v", nodename, err)
+		log.Error("Error draining %s: %v", nodename, err)
+		return false
 	}
+	return true
 }

 func uncordon(client *kubernetes.Clientset, node *v1.Node) {
```

Review thread on the new `return false`:

> **Reviewer:** There is another PR that grew a bit and is touching this; maybe it's worth sharing efforts? Can you have a look at https://github.com/weaveworks/kured/pull/341/files ?
>
> **Author:** I may have missed something, but I think #341 doesn't uncordon the nodes after the timeout?
>
> **Reviewer:** Correct. Here, I think the error handling can indeed be improved. We can change the function signature to return the error, and maybe the result. If an error happens, log it. If forceReboot isn't true, we should probably uncordon and continue the main loop; else, the control flow continues. That makes it far clearer to read, IMO.
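A minimal sketch of the control flow the reviewer describes, with `drain` returning an error and a hypothetical `forceReboot` flag (the name is borrowed from the #341 discussion, not from this PR's code; the drain body is a placeholder):

```go
package main

import (
	"errors"
	"log"
)

var forceReboot bool // hypothetical flag, per the #341 discussion

// drain would cordon and drain the node, returning the error instead of
// calling log.Fatalf, so the caller decides how to react.
func drain(nodename string) error {
	return errors.New("drain timed out") // placeholder for the real drain
}

func uncordon(nodename string) {
	log.Printf("Uncordoning %s", nodename)
}

func rebootIfPossible(nodename string) {
	if err := drain(nodename); err != nil {
		log.Printf("Error draining %s: %v", nodename, err)
		if !forceReboot {
			uncordon(nodename)
			return // back to the main loop; retry on the next tick
		}
		// forceReboot set: continue to the reboot despite the failed drain
	}
	log.Printf("Rebooting %s", nodename)
}

func main() { rebootIfPossible("node-1") }
```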
```diff
@@ -531,11 +538,21 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []string
 			continue
 		}

-		drain(client, node)
-		invokeReboot(nodeID, rebootCommand)
-		for {
-			log.Infof("Waiting for reboot")
-			time.Sleep(time.Minute)
-		}
+		isDrained = drain(client, node)
+		if isDrained {
+			invokeReboot(nodeID, rebootCommand)
+			for {
+				log.Infof("Waiting for reboot")
+				time.Sleep(time.Minute)
+			}
+		} else {
+			log.Infof("Uncordon %s", node.GetName())
+			uncordon(client, node)
+			deleteFlag := newCommand("/usr/bin/nsenter", "-m/proc/1/ns/mnt", "/bin/rm", rebootSentinelFile)
+			log.Infof("Delete flag : %s", rebootSentinelFile)
+			if err := deleteFlag.Run(); err != nil {
+				log.Fatalf("Error invoking reboot command: %v", err)
+			}
+		}
 	}
 }
```

Review comment on the `log.Infof("Waiting for reboot")` line (truncated in source):

> this should probably be …

Review comment on the `deleteFlag := newCommand(...)` line:

> **Reviewer:** I think it would be best to use the available functions for wrapping things properly (so that it works on all OSes). Next, what about the cases where the sentinel is a command instead of a sentinel file?
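One shape the wrapping suggestion could take, sketched with a hypothetical `hostCommand` helper rather than kured's actual wrappers (the sentinel-as-command case raised in the second question would still need separate handling):

```go
package main

import (
	"log"
	"os/exec"
)

// hostCommand is a hypothetical helper that centralizes the nsenter
// wrapping, so call sites don't hard-code the wrapper arguments the way
// the deleteFlag line above does.
func hostCommand(name string, args ...string) *exec.Cmd {
	wrapped := append([]string{"-m/proc/1/ns/mnt", name}, args...)
	return exec.Command("/usr/bin/nsenter", wrapped...)
}

func main() {
	// /var/run/reboot-required is kured's default sentinel file.
	deleteFlag := hostCommand("/bin/rm", "-f", "/var/run/reboot-required")
	if err := deleteFlag.Run(); err != nil {
		log.Printf("Error removing sentinel file: %v", err)
	}
}
```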
```diff
@@ -587,6 +604,11 @@ func root(cmd *cobra.Command, args []string) {
 		log.Info("Lock TTL not set, lock will remain until being released")
 	}
 	log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName)
+	if drainTimeout > 0 {
+		log.Infof("Drain timeout set, node will uncordon after: %v", drainTimeout)
+	} else {
+		log.Info("Drain timeout not set, node will try to drain until it's drained")
+	}
 	log.Infof("Blocking Pod Selectors: %v", podSelectors)
 	log.Infof("Reboot schedule: %v", window)
 	log.Infof("Reboot check command: %s every %v", sentinelCommand, period)
```
Review comment:

> This should probably be `Errorf`
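That comment likely refers to the new `log.Error` call in `drain`. Assuming the `log` package is sirupsen/logrus (which kured uses), `Error` has `fmt.Sprint` semantics, so the `%s` and `%v` verbs are printed literally rather than expanded; `Errorf` is the formatting variant:

```go
package main

import log "github.com/sirupsen/logrus"

func main() {
	nodename := "node-1"
	err := error(nil)
	// Sprint semantics: verbs are not expanded, operands are appended.
	log.Error("Error draining %s: %v", nodename, err)
	// Sprintf semantics: formats as intended.
	log.Errorf("Error draining %s: %v", nodename, err)
}
```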