[WIP] Add Context to Run method of the manager #757

Closed · wants to merge 7 commits
4 changes: 2 additions & 2 deletions cmd/machine-controller-manager-cli/main.go
@@ -4,8 +4,8 @@ import (
"bytes"
"flag"
"fmt"
"io/ioutil"
"log"
"os"

"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"github.com/gardener/machine-controller-manager/pkg/driver"
@@ -119,7 +119,7 @@ func main() {

// Read function decodes the yaml file passed to it
func Read(fileName string, decodedObj interface{}) error {
-m, err := ioutil.ReadFile(fileName)
+m, err := os.ReadFile(fileName)
if err != nil {
log.Fatalf("Could not read %s: %s", fileName, err)
}
78 changes: 42 additions & 36 deletions cmd/machine-controller-manager/app/controllermanager.go
@@ -30,6 +30,7 @@ import (
"os"
goruntime "runtime"
"strconv"
"sync"
"time"

machinescheme "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/scheme"
@@ -73,7 +74,7 @@
)

// Run runs the MCMServer. This should never exit.
-func Run(s *options.MCMServer) error {
+func Run(ctx context.Context, s *options.MCMServer) error {
// To help debugging, immediately log version
klog.V(4).Infof("Version: %+v", version.Get())
if err := s.Validate(); err != nil {
@@ -126,8 +127,9 @@ func Run(s *options.MCMServer) error {

recorder := createRecorder(kubeClientControl)

-run := func(ctx context.Context) {
-var stop <-chan struct{}
+waitGroup := sync.WaitGroup{}
+waitGroup.Add(1)
+startControllers := func(ctx context.Context) {
// Control plane client used to interact with machine APIs
controlMachineClientBuilder := machinecontroller.SimpleClientBuilder{
ClientConfig: controlkubeconfig,
@@ -141,25 +143,23 @@
ClientConfig: targetkubeconfig,
}

-err := StartControllers(
+if err := StartControllers(
+ctx,
s,
controlkubeconfig,
targetkubeconfig,
controlMachineClientBuilder,
controlCoreClientBuilder,
targetCoreClientBuilder,
recorder,
-stop,
-)
-klog.Fatalf("error running controllers: %v", err)
-panic("unreachable")
+); err != nil {
+klog.Fatalf("failed to start controllers: %v", err)
+}
+waitGroup.Done()
}

if !s.LeaderElection.LeaderElect {
-run(nil)
-panic("unreachable")
+startControllers(ctx)
Contributor

On sending Ctrl-C when MCM runs with leader-elect disabled, leader election is attempted again:

^CI1109 12:23:33.015676   54619 controller.go:525] Shutting down Machine Controller Manager 
I1109 12:23:33.015729   54619 reflector.go:225] Stopping reflector *v1.Node (12h0m0s) from k8s.io/client-go/informers/factory.go:134
.
.
.
I1109 12:23:33.016592   54619 reflector.go:225] Stopping reflector *v1alpha1.OpenStackMachineClass (12h0m0s) from github.com/gardener/machine-controller-manager/pkg/client/informers/externalversions/factory.go:117
I1109 12:23:35.016078   54619 leaderelection.go:248] attempting to acquire leader lease shoot--i544024--ca-test/machine-controller-manager...
E1109 12:23:35.016100   54619 controllermanager.go:193] leaderelection lost

A return here would help.
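
A minimal sketch of the suggested guard, reusing the startControllers closure and ctx from this diff (illustrative only, not code from the PR):

if !s.LeaderElection.LeaderElect {
	// startControllers blocks until ctx is cancelled (e.g. on Ctrl-C).
	startControllers(ctx)
	// Returning here avoids falling through to the leader-election setup
	// below, which is what re-triggers the lease acquisition on shutdown.
	return nil
}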

}

id, err := os.Hostname()
@@ -182,32 +182,34 @@ func Run(s *options.MCMServer) error {
klog.Fatalf("error creating lock: %v", err)
}

-ctx := context.TODO()
leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
Lock: rl,
LeaseDuration: s.LeaderElection.LeaseDuration.Duration,
RenewDeadline: s.LeaderElection.RenewDeadline.Duration,
RetryPeriod: s.LeaderElection.RetryPeriod.Duration,
Callbacks: leaderelection.LeaderCallbacks{
-OnStartedLeading: run,
+OnStartedLeading: startControllers,
OnStoppedLeading: func() {
klog.Fatalf("leaderelection lost")
klog.Errorf("leaderelection lost")
waitGroup.Wait()
Comment on lines +193 to +194
Contributor

Shouldn't we exit immediately after losing leader election? Why wait?

Member Author

I just wanted to ensure that all the worker processes finish here. Otherwise you will see something like this:

...
^CE1108 14:24:59.627336   35435 controllermanager.go:193] leaderelection lost
I1108 14:24:59.627378   35435 controller.go:556] Stopped worker for type: ClusterMachineSet
I1108 14:24:59.627390   35435 controller.go:556] Stopped worker for type: ClusterMachineSafetyOvershooting
I1108 14:24:59.627397   35435 controller.go:556] Stopped worker for type: ClusterMachineAPIServer

The controller stops in the middle of the shutdown.

In contrast, with the waitGroup.Wait() the result looks like the following:

...
I1108 14:25:28.146883   35495 controller.go:556] Stopped worker for type: ClusterPacketMachineClass
I1108 14:25:28.146889   35495 controller.go:556] Stopped worker for type: ClusterMachineSafetyOvershooting
I1108 14:25:28.146931   35495 reflector.go:225] Stopping reflector *v1alpha1.MachineSet (12h0m0s) from pkg/client/informers/externalversions/factory.go:117
I1108 14:25:28.146980   35495 reflector.go:225] Stopping reflector *v1alpha1.OpenStackMachineClass (12h0m0s) from pkg/client/informers/externalversions/factory.go:117
I1108 14:25:28.147009   35495 reflector.go:225] Stopping reflector *v1.PersistentVolume (12h0m0s) from k8s.io/client-go/informers/factory.go:134
I1108 14:25:28.147077   35495 reflector.go:225] Stopping reflector *v1alpha1.AzureMachineClass (12h0m0s) from pkg/client/informers/externalversions/factory.go:117
I1108 14:25:28.147168   35495 reflector.go:225] Stopping reflector *v1alpha1.MachineDeployment (12h0m0s) from pkg/client/informers/externalversions/factory.go:117
I1108 14:25:28.147219   35495 reflector.go:225] Stopping reflector *v1alpha1.AWSMachineClass (12h0m0s) from pkg/client/informers/externalversions/factory.go:117
I1108 14:25:28.147238   35495 reflector.go:225] Stopping reflector *v1alpha1.AlicloudMachineClass (12h0m0s) from pkg/client/informers/externalversions/factory.go:117
I1108 14:25:28.151506   35495 controller.go:525] Shutting down Machine Controller Manager 
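
A condensed sketch of the coordination being described, using the names from this diff: the WaitGroup makes OnStoppedLeading block until startControllers has finished shutting the workers down.

waitGroup := sync.WaitGroup{}
waitGroup.Add(1)

startControllers := func(ctx context.Context) {
	// ... start informers and controllers, block until ctx is cancelled ...
	waitGroup.Done() // signal that shutdown has completed
}

callbacks := leaderelection.LeaderCallbacks{
	OnStartedLeading: startControllers,
	OnStoppedLeading: func() {
		klog.Errorf("leaderelection lost")
		waitGroup.Wait() // let the workers drain before returning
	},
}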

Contributor

Okay, but this is for the case where an interrupt is sent to the leader process, which leads to a shutdown of the workers. When the leader fails to renew its lease (due to API latency, for example), there won't be a shutdown of the workers, right? So we'll keep on waiting?

Contributor

Another case I noticed: when it is attempting to acquire the leader lease and we send a Ctrl-C, it keeps waiting forever since there are no workers yet; ideally it should exit with just one interrupt.

Contributor

One solution could be:

OnStoppedLeading: func() {
	klog.Errorf("leaderelection lost")
	select {
	case <-ctx.Done():
	default:
		syscall.Kill(syscall.Getpid(), syscall.SIGINT)
	}
	waitGroup.Wait()
}

Contributor

Leadership can be lost for various reasons. Would it not be better if all the controllers that are started on acquiring leadership were subsequently stopped on losing it? There is no need to exit: leaderelection.RunOrDie blocks until its context is cancelled, at which point it returns and the Run method completes its job. This would typically happen when that instance is going down, in which case there is nothing more to do.

I might have missed something, so please feel free to point that out.
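
A minimal sketch of the flow this comment describes, assuming a signal-cancelled root context (the helper name runWithLeaderElection is illustrative):

package main

import (
	"context"
	"os"
	"os/signal"
	"syscall"

	"k8s.io/client-go/tools/leaderelection"
)

func runWithLeaderElection(lec leaderelection.LeaderElectionConfig) {
	// Cancel the root context on SIGINT/SIGTERM.
	ctx, cancel := context.WithCancel(context.Background())
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		<-sigCh
		cancel() // stops the controllers and the leader election alike
	}()

	// RunOrDie blocks until ctx is cancelled, then returns; the caller
	// can simply return instead of exiting the process.
	leaderelection.RunOrDie(ctx, lec)
}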

},
},
})
panic("unreachable")

return nil
}

// StartControllers starts all the controllers which are a part of machine-controller-manager
-func StartControllers(s *options.MCMServer,
+func StartControllers(
+ctx context.Context,
+s *options.MCMServer,
controlCoreKubeconfig *rest.Config,
targetCoreKubeconfig *rest.Config,
controlMachineClientBuilder machinecontroller.ClientBuilder,
controlCoreClientBuilder corecontroller.ClientBuilder,
targetCoreClientBuilder corecontroller.ClientBuilder,
recorder record.EventRecorder,
-stop <-chan struct{}) error {
+) error {
klog.V(5).Info("Getting available resources")
availableResources, err := getAvailableResources(controlCoreClientBuilder)
if err != nil {
@@ -231,18 +233,16 @@ func StartControllers(s *options.MCMServer,
if availableResources[machineGVR] || availableResources[machineSetGVR] || availableResources[machineDeploymentGVR] {
klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)

-controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory(
+controlMachineInformerFactory := machineinformers.NewSharedInformerFactoryWithOptions(
controlMachineClientBuilder.ClientOrDie("control-machine-shared-informers"),
s.MinResyncPeriod.Duration,
-s.Namespace,
-nil,
+machineinformers.WithNamespace(s.Namespace),
)

-controlCoreInformerFactory := coreinformers.NewFilteredSharedInformerFactory(
+controlCoreInformerFactory := coreinformers.NewSharedInformerFactoryWithOptions(
controlCoreClientBuilder.ClientOrDie("control-core-shared-informers"),
s.MinResyncPeriod.Duration,
-s.Namespace,
-nil,
+coreinformers.WithNamespace(s.Namespace),
)

targetCoreInformerFactory := coreinformers.NewSharedInformerFactory(
@@ -284,22 +284,28 @@
}
klog.V(1).Info("Starting shared informers")

-controlMachineInformerFactory.Start(stop)
-controlCoreInformerFactory.Start(stop)
-targetCoreInformerFactory.Start(stop)
+controlMachineInformerFactory.Start(ctx.Done())
+controlCoreInformerFactory.Start(ctx.Done())
+targetCoreInformerFactory.Start(ctx.Done())

klog.V(5).Info("Running controller")
-go mcmcontroller.Run(int(s.ConcurrentNodeSyncs), stop)
+var waitGroup sync.WaitGroup
+waitGroup.Add(1)
+go func() {
+mcmcontroller.Run(ctx, int(s.ConcurrentNodeSyncs))
+waitGroup.Done()
+}()
+waitGroup.Wait()
} else {
return fmt.Errorf("unable to start machine controller: API GroupVersion %q or %q or %q is not available; \nFound: %#v", machineGVR, machineSetGVR, machineDeploymentGVR, availableResources)
}

-select {}
+return nil
}

-// TODO: In general, any controller checking this needs to be dynamic so
-// users don't have to restart their controller manager if they change the apiserver.
+// TODO: In general, any controller checking this needs to be dynamic so users don't have to
+// restart their controller manager if they change the apiserver.
//
// Until we get there, the structure here needs to be exposed for the construction of a proper ControllerContext.
func getAvailableResources(clientBuilder corecontroller.ClientBuilder) (map[schema.GroupVersionResource]bool, error) {
var discoveryClient discovery.DiscoveryInterface
@@ -330,16 +336,16 @@ func getAvailableResources(clientBuilder corecontroller.ClientBuilder) (map[sche
return nil, fmt.Errorf("failed to get api versions from server: %v: %v", healthzContent, err)
}

-resourceMap, err := discoveryClient.ServerResources()
+_, resources, err := discoveryClient.ServerGroupsAndResources()
if err != nil {
utilruntime.HandleError(fmt.Errorf("unable to get all supported resources from server: %v", err))
}
-if len(resourceMap) == 0 {
+if len(resources) == 0 {
return nil, fmt.Errorf("unable to get any supported resources from server")
}

allResources := map[schema.GroupVersionResource]bool{}
-for _, apiResourceList := range resourceMap {
+for _, apiResourceList := range resources {
version, err := schema.ParseGroupVersion(apiResourceList.GroupVersion)
if err != nil {
return nil, err
7 changes: 2 additions & 5 deletions cmd/machine-controller-manager/controller_manager.go
@@ -23,6 +23,7 @@ package main

import (
"fmt"
"github.com/gardener/machine-controller-manager/pkg/util/signals"
"os"

"github.com/gardener/machine-controller-manager/cmd/machine-controller-manager/app"
@@ -38,19 +39,15 @@
)

func main() {

s := options.NewMCMServer()
s.AddFlags(pflag.CommandLine)

flag.InitFlags()
logs.InitLogs()
defer logs.FlushLogs()

-// verflag.PrintAndExitIfRequested()

-if err := app.Run(s); err != nil {
+if err := app.Run(signals.SetupSignalHandler(), s); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}

}
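
The pkg/util/signals package used above is not shown in this diff; a typical implementation of such a helper (a sketch under that assumption, not necessarily the package's actual code) cancels a context on the first signal and exits hard on the second:

package signals

import (
	"context"
	"os"
	"os/signal"
	"syscall"
)

// SetupSignalHandler returns a context that is cancelled on the first
// SIGINT or SIGTERM; a second signal forces an immediate exit.
func SetupSignalHandler() context.Context {
	ctx, cancel := context.WithCancel(context.Background())
	c := make(chan os.Signal, 2)
	signal.Notify(c, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		<-c
		cancel() // first signal: begin graceful shutdown
		<-c
		os.Exit(1) // second signal: exit immediately
	}()
	return ctx
}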
2 changes: 1 addition & 1 deletion go.mod
@@ -102,7 +102,7 @@ require (
golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect
golang.org/x/tools v0.1.5 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
-google.golang.org/appengine v1.6.6 // indirect
+google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c // indirect
google.golang.org/grpc v1.38.0 // indirect
google.golang.org/protobuf v1.26.0 // indirect
3 changes: 2 additions & 1 deletion go.sum
@@ -814,8 +814,9 @@ google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0=
google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
-google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc=
google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
+google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c=
+google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
3 changes: 1 addition & 2 deletions pkg/apis/machine/register.go
@@ -43,8 +43,7 @@ var (
// the code-generation can find it.
SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)
// AddToScheme is exposed for API installation
AddToScheme = SchemeBuilder.AddToScheme
-localSchemeBuilder = &SchemeBuilder
)

func addKnownTypes(scheme *runtime.Scheme) error {
25 changes: 0 additions & 25 deletions pkg/apis/machine/v1alpha1/defaults.go

This file was deleted.

3 changes: 1 addition & 2 deletions pkg/controller/alicloudmachineclass.go
@@ -130,8 +130,7 @@ func (c *controller) alicloudMachineClassDelete(obj interface{}) {

// reconcileClusterAlicloudMachineClassKey reconciles an AlicloudMachineClass due to controller resync
// or an event on the alicloudMachineClass.
-func (c *controller) reconcileClusterAlicloudMachineClassKey(key string) error {
-ctx := context.Background()
+func (c *controller) reconcileClusterAlicloudMachineClassKey(ctx context.Context, key string) error {
_, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return err
3 changes: 1 addition & 2 deletions pkg/controller/awsmachineclass.go
@@ -131,8 +131,7 @@ func (c *controller) awsMachineClassDelete(obj interface{}) {

// reconcileClusterAWSMachineClassKey reconciles an AWSMachineClass due to controller resync
// or an event on the awsMachineClass.
-func (c *controller) reconcileClusterAWSMachineClassKey(key string) error {
-ctx := context.Background()
+func (c *controller) reconcileClusterAWSMachineClassKey(ctx context.Context, key string) error {
_, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return err
2 changes: 1 addition & 1 deletion pkg/controller/awsmachineclass_test.go
@@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
3 changes: 1 addition & 2 deletions pkg/controller/azuremachineclass.go
@@ -131,8 +131,7 @@ func (c *controller) azureMachineClassDelete(obj interface{}) {

// reconcileClusterAzureMachineClassKey reconciles an AzureMachineClass due to controller resync
// or an event on the azureMachineClass.
-func (c *controller) reconcileClusterAzureMachineClassKey(key string) error {
-ctx := context.Background()
+func (c *controller) reconcileClusterAzureMachineClassKey(ctx context.Context, key string) error {
_, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return err
Expand Down