feat: Manager switching upon quorum loss #155

Open · wants to merge 17 commits into base: master
49 changes: 25 additions & 24 deletions .github/workflows/docker-tests-8.0.yml
@@ -40,30 +40,31 @@ jobs:
strategy:
matrix:
command:
- 'VERSION=8.0 GODOG_FEATURE=active_nodes.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=async.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=async_setting.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=cascade_replicas.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=CLI.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=crash_recovery.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=events_reenable.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=external_replication.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=failover.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=free_space.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=host_discovery.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=host_management.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=maintenance.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=offline_mode.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=priority.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=readonly_filesystem.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=recovery.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=repair.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=repl_mon.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=statefile.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=switchover_from.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=switchover_to.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=zk_failure.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=zk_maintenance.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=active_nodes.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=async.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=async_setting.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=cascade_replicas.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=CLI.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=crash_recovery.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=events_reenable.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=external_replication.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=failover.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=free_space.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=host_discovery.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=host_management.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=maintenance.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=offline_mode.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=priority.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=readonly_filesystem.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=recovery.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=repair.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=repl_mon.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=statefile.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=switchover_from.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=switchover_to.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=zk_failure.feature make test'
# - 'VERSION=8.0 GODOG_FEATURE=zk_maintenance.feature make test'
- 'VERSION=8.0 GODOG_FEATURE=manager_switchover.feature make test'
fail-fast: false

steps:
45 changes: 23 additions & 22 deletions .github/workflows/docker-tests.yml
@@ -40,28 +40,29 @@ jobs:
strategy:
matrix:
command:
- 'GODOG_FEATURE=active_nodes.feature make test'
- 'GODOG_FEATURE=async.feature make test'
- 'GODOG_FEATURE=cascade_replicas.feature make test'
- 'GODOG_FEATURE=CLI.feature make test'
- 'GODOG_FEATURE=crash_recovery.feature make test'
- 'GODOG_FEATURE=events_reenable.feature make test'
- 'GODOG_FEATURE=failover.feature make test'
- 'GODOG_FEATURE=free_space.feature make test'
- 'GODOG_FEATURE=host_discovery.feature make test'
- 'GODOG_FEATURE=host_management.feature make test'
- 'GODOG_FEATURE=maintenance.feature make test'
- 'GODOG_FEATURE=offline_mode.feature make test'
- 'GODOG_FEATURE=priority.feature make test'
- 'GODOG_FEATURE=readonly_filesystem.feature make test'
- 'GODOG_FEATURE=recovery.feature make test'
- 'GODOG_FEATURE=repair.feature make test'
- 'GODOG_FEATURE=repl_mon.feature make test'
- 'GODOG_FEATURE=statefile.feature make test'
- 'GODOG_FEATURE=switchover_from.feature make test'
- 'GODOG_FEATURE=switchover_to.feature make test'
- 'GODOG_FEATURE=zk_failure.feature make test'
- 'GODOG_FEATURE=zk_maintenance.feature make test'
# - 'GODOG_FEATURE=active_nodes.feature make test'
# - 'GODOG_FEATURE=async.feature make test'
# - 'GODOG_FEATURE=cascade_replicas.feature make test'
# - 'GODOG_FEATURE=CLI.feature make test'
# - 'GODOG_FEATURE=crash_recovery.feature make test'
# - 'GODOG_FEATURE=events_reenable.feature make test'
# - 'GODOG_FEATURE=failover.feature make test'
# - 'GODOG_FEATURE=free_space.feature make test'
# - 'GODOG_FEATURE=host_discovery.feature make test'
# - 'GODOG_FEATURE=host_management.feature make test'
# - 'GODOG_FEATURE=maintenance.feature make test'
# - 'GODOG_FEATURE=offline_mode.feature make test'
# - 'GODOG_FEATURE=priority.feature make test'
# - 'GODOG_FEATURE=readonly_filesystem.feature make test'
# - 'GODOG_FEATURE=recovery.feature make test'
# - 'GODOG_FEATURE=repair.feature make test'
# - 'GODOG_FEATURE=repl_mon.feature make test'
# - 'GODOG_FEATURE=statefile.feature make test'
# - 'GODOG_FEATURE=switchover_from.feature make test'
# - 'GODOG_FEATURE=switchover_to.feature make test'
# - 'GODOG_FEATURE=zk_failure.feature make test'
# - 'GODOG_FEATURE=zk_maintenance.feature make test'
- 'GODOG_FEATURE=manager_switchover.feature make test'
fail-fast: false

steps:
2 changes: 1 addition & 1 deletion .golangci.yml
@@ -52,7 +52,7 @@ linters:
- bodyclose
- dupl
- errcheck
- exportloopref
- copyloopvar
- funlen
- gocritic
- gocyclo
131 changes: 125 additions & 6 deletions internal/app/app.go
@@ -40,6 +40,7 @@
replRepairState map[string]*ReplicationRepairState
externalReplication mysql.IExternalReplication
switchHelper mysql.ISwitchHelper
lostQuorumTime time.Time
}

// NewApp returns new App. Suddenly.
@@ -484,7 +485,7 @@
return stateFirstRun
}
app.dcs.Initialize()
if app.dcs.AcquireLock(pathManagerLock) {
if app.AcquireLock(pathManagerLock) {
return stateManager
}
return stateCandidate
@@ -564,7 +565,7 @@
return stateMaintenance
}
if err == dcs.ErrNotFound || maintenance.ShouldLeave {
if app.dcs.AcquireLock(pathManagerLock) {
if app.AcquireLock(pathManagerLock) {
app.logger.Info("leaving maintenance")
err := app.leaveMaintenance()
if err != nil {
@@ -586,7 +587,7 @@
if !app.dcs.IsConnected() {
return stateLost
}
if !app.dcs.AcquireLock(pathManagerLock) {
if !app.AcquireLock(pathManagerLock) {
return stateCandidate
}

@@ -607,6 +608,15 @@
return stateManager
}

if app.config.ManagerSwitchover {
managerSeeMaster, err := app.checkMasterVisible(clusterState, clusterStateDcs)
if err == nil && !managerSeeMaster {
if state, ok := app.checkQuorum(clusterState, clusterStateDcs); !ok {
return state
}
}
}

// master is master host that should be on cluster
master, err := app.getCurrentMaster(clusterState)
if err != nil {
@@ -752,6 +762,115 @@
return stateManager
}

func (app *App) checkMasterVisible(clusterStateFromDB, clusterStateDcs map[string]*NodeState) (bool, error) {
masterHost, err := app.getMasterHost(clusterStateDcs)
if err != nil {
app.logger.Errorf("checkMasterVisible: can't get master host, error: %s", err)
return false, err
}
state, ok := clusterStateFromDB[masterHost]

[GitHub Actions / lint annotation: check failure on line 771 in internal/app/app.go: File is not properly formatted (gofmt)]
app.logger.Debugf("master(%s) state pingOk == %s", masterHost, state)
if ok && state.PingOk {
app.logger.Debug("Master is visible to the manager, so we don't need a manager switchover")
return true, nil
}

retryPingOk, err := app.cluster.PingNode(masterHost)
if err != nil {
app.logger.Errorf("checkMasterVisible: app.cluster.PingNode(%s) fail with error: %s", masterHost, err)
}

return retryPingOk, err
}

func (app *App) checkQuorum(clusterStateFromDB, clusterStateDcs map[string]*NodeState) (appState, bool) {
// Counting the number of HA hosts visible to the manager.
visibleHAHostsCount := 0
workingHANodesCount := 0
HAHosts := app.cluster.HANodeHosts()

for _, host := range HAHosts {
nodeStateFromDB, ok := clusterStateFromDB[host]
if !ok {
app.logger.Errorf("app.cluster.HANodeHosts() returns host %s, which does not exist in app.getClusterStateFromDB()", host)
break
}
pingFromDB := nodeStateFromDB.PingOk

nodeStateFromDcs, ok := clusterStateDcs[host]
if !ok {
app.logger.Errorf("host %s exists in cluster state from DB, but does not exist in cluster state from DCS", host)
break
}
pingFromDcs := nodeStateFromDcs.PingOk

if pingFromDcs {
workingHANodesCount++
if pingFromDB {
visibleHAHostsCount++
}
}
}

managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss

if workingHANodesCount > 0 && visibleHAHostsCount <= (workingHANodesCount-1)/2 {
app.logger.Infof("manager lost quorum (%d/%d visible HAHosts)", visibleHAHostsCount, workingHANodesCount)
lostQuorumDuration := time.Since(app.lostQuorumTime)

if app.lostQuorumTime.IsZero() {
app.lostQuorumTime = time.Now()
} else {
// Quorum was lost less than ManagerElectionDelayAfterQuorumLoss (default 30s) ago:
// just wait for the manager to recover its network connection
if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss {
app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager waits for network recovery", lostQuorumDuration.Seconds())
// Quorum was lost more than ManagerElectionDelayAfterQuorumLoss (default 30s) ago:
// the manager should release the lock and not re-acquire it for ManagerLockAcquireDelayAfterQuorumLoss (default 45s)
} else if lostQuorumDuration > managerElectionDelayAfterQuorumLoss {
app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager releases the lock", lostQuorumDuration.Seconds())
app.dcs.ReleaseLock(pathManagerLock)
return stateCandidate, false
}
}
} else {
app.logger.Debugf("manager sees %d HAHosts of %d (manager has quorum), so the manager should not release the lock", visibleHAHostsCount, workingHANodesCount)
app.lostQuorumTime = time.Time{}
}

return "", true
}

func (app *App) AcquireLock(path string) bool {
if app.lostQuorumTime.IsZero() {
return app.dcs.AcquireLock(path)
}

managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
managerLockAcquireDelayAfterQuorumLoss := app.config.ManagerLockAcquireDelayAfterQuorumLoss

lostQuorumDuration := time.Since(app.lostQuorumTime)
if lostQuorumDuration < managerElectionDelayAfterQuorumLoss {
app.logger.Debug("manager try to acquire lock")
Review comment (Collaborator): What do you think about providing more information regarding the quorum situation in this line? When we have a non-zero lostQuorumDuration, it might be helpful to highlight that in the application logs. I suggest adding something like "In the period of lost quorum <DURATION>" as a suffix to the logger text, as I believe it would make the situation more clear to the reader.
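A sketch of what that suggestion might look like at this point, for illustration only (lostQuorumDuration is already computed a few lines above; the final wording is up to the author):

    app.logger.Debugf("manager try to acquire lock; in the period of lost quorum %0.2fs", lostQuorumDuration.Seconds())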

return app.dcs.AcquireLock(path)
} else if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss+managerLockAcquireDelayAfterQuorumLoss {
// Manager can't acquire the lock during this delay
app.logger.Debugf(
"Quorum loss ongoing (%0.2fs): manager lock acquisition blocked (%0.2fs/%0.2fs cooldown)",
lostQuorumDuration.Seconds(),
lostQuorumDuration.Seconds()-managerElectionDelayAfterQuorumLoss.Seconds(),
managerLockAcquireDelayAfterQuorumLoss.Seconds(),
)
return false
// Manager starts trying to acquire the lock again
} else if lostQuorumDuration > app.config.ManagerElectionDelayAfterQuorumLoss+app.config.ManagerLockAcquireDelayAfterQuorumLoss {
app.lostQuorumTime = time.Time{}
return app.dcs.AcquireLock(path)
}

return false
}

func (app *App) approveFailover(clusterState, clusterStateDcs map[string]*NodeState, activeNodes []string, master string) error {
if !app.config.Failover {
return fmt.Errorf("auto_failover is disabled in config")
@@ -819,7 +938,7 @@
if maintenance != nil && maintenance.MySyncPaused {
return stateMaintenance
}
if app.dcs.AcquireLock(pathManagerLock) {
if app.AcquireLock(pathManagerLock) {
return stateManager
}
return stateCandidate
@@ -1291,7 +1410,7 @@
}

// setting server read-only may take a while so we need to ensure we are still a manager
if !app.dcs.AcquireLock(pathManagerLock) || app.emulateError("set_read_only_lost_lock") {
if !app.AcquireLock(pathManagerLock) || app.emulateError("set_read_only_lost_lock") {
return errors.New("manager lock lost during switchover, new manager should finish the process, leaving")
}

@@ -1362,7 +1481,7 @@
newMaster, mostRecent, app.config.SlaveCatchUpTimeout)
}
// catching up may take a while so we need to ensure we are still a manager
if !app.dcs.AcquireLock(pathManagerLock) || app.emulateError("catchup_lost_lock") {
if !app.AcquireLock(pathManagerLock) || app.emulateError("catchup_lost_lock") {
return errors.New("manager lock lost during switchover, new manager should finish the process, leaving")
}
app.logger.Infof("switchover: new master %s caught up", newMaster)
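In short, a manager that cannot see a majority of the working HA hosts first waits manager_election_delay_after_quorum_loss for the network to recover, then releases the manager lock and stays out of contention for manager_lock_acquire_delay_after_quorum_loss before competing again. Below is a minimal standalone sketch of that timeline (not part of this diff), using the default delays from DefaultConfig in config.go; the helper and constant names are illustrative, and the real AcquireLock also calls the DCS and resets lostQuorumTime on the App struct.

package main

import (
	"fmt"
	"time"
)

// Defaults taken from the DefaultConfig() hunk below; names are illustrative.
const (
	electionDelay    = 30 * time.Second // manager_election_delay_after_quorum_loss
	lockAcquireDelay = 45 * time.Second // manager_lock_acquire_delay_after_quorum_loss
)

// mayAcquireLock mirrors the branching of App.AcquireLock for a manager whose
// lostQuorumTime was set `elapsed` ago (simplified: no DCS call, no state reset).
func mayAcquireLock(elapsed time.Duration) bool {
	switch {
	case elapsed < electionDelay:
		// Still waiting for the network to recover: the manager keeps competing for the lock.
		return true
	case elapsed <= electionDelay+lockAcquireDelay:
		// checkQuorum has released the lock; re-acquisition is blocked during the cooldown.
		return false
	default:
		// Cooldown is over: the manager may compete for the lock again.
		return true
	}
}

func main() {
	for _, elapsed := range []time.Duration{10 * time.Second, 40 * time.Second, 80 * time.Second} {
		fmt.Printf("%4.0fs after quorum loss: may acquire lock = %v\n", elapsed.Seconds(), mayAcquireLock(elapsed))
	}
}

Running it prints true at 10s, false at 40s, and true again at 80s, matching the three branches of the new AcquireLock.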
6 changes: 6 additions & 0 deletions internal/config/config.go
@@ -66,6 +66,8 @@ type Config struct {
InfoFileHandlerInterval time.Duration `config:"info_file_handler_interval" yaml:"info_file_handler_interval"`
RecoveryCheckInterval time.Duration `config:"recoverycheck_interval" yaml:"recoverycheck_interval"`
ExternalCAFileCheckInterval time.Duration `config:"external_ca_file_check_interval" yaml:"external_ca_file_check_interval"`
ManagerElectionDelayAfterQuorumLoss time.Duration `config:"manager_election_delay_after_quorum_loss" yaml:"manager_election_delay_after_quorum_loss"`
ManagerLockAcquireDelayAfterQuorumLoss time.Duration `config:"manager_lock_acquire_delay_after_quorum_loss" yaml:"manager_lock_acquire_delay_after_quorum_loss"`
MaxAcceptableLag float64 `config:"max_acceptable_lag" yaml:"max_acceptable_lag"`
SlaveCatchUpTimeout time.Duration `config:"slave_catch_up_timeout" yaml:"slave_catch_up_timeout"`
DisableSemiSyncReplicationOnMaintenance bool `config:"disable_semi_sync_replication_on_maintenance" yaml:"disable_semi_sync_replication_on_maintenance"`
@@ -97,6 +99,7 @@ type Config struct {
ReplMonErrorWaitInterval time.Duration `config:"repl_mon_error_wait_interval" yaml:"repl_mon_error_wait_interval"`
ReplMonSlaveWaitInterval time.Duration `config:"repl_mon_slave_wait_interval" yaml:"repl_mon_slave_wait_interval"`
ShowOnlyGTIDDiff bool `config:"show_only_gtid_diff" yaml:"show_only_gtid_diff"`
ManagerSwitchover bool `config:"manager_switchover" yaml:"manager_switchover"`
ForceSwitchover bool `config:"force_switchover" yaml:"force_switchover"` // TODO: Remove when we will be sure it's right way to do switchover
}

@@ -154,6 +157,8 @@ func DefaultConfig() (Config, error) {
InfoFileHandlerInterval: 30 * time.Second,
RecoveryCheckInterval: 5 * time.Second,
ExternalCAFileCheckInterval: 5 * time.Second,
ManagerElectionDelayAfterQuorumLoss: 30 * time.Second, // need more than 15 sec
ManagerLockAcquireDelayAfterQuorumLoss: 45 * time.Second,
MaxAcceptableLag: 60.0,
SlaveCatchUpTimeout: 30 * time.Minute,
DisableSemiSyncReplicationOnMaintenance: true,
@@ -183,6 +188,7 @@ func DefaultConfig() (Config, error) {
ReplMonErrorWaitInterval: 10 * time.Second,
ReplMonSlaveWaitInterval: 10 * time.Second,
ShowOnlyGTIDDiff: false,
ManagerSwitchover: false,
ForceSwitchover: false,
}
return config, nil
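A hypothetical mysync YAML fragment that would enable this behavior; the keys follow the yaml tags added in this file, the durations mirror the defaults above, and the exact duration syntax depends on how the config loader parses time.Duration values:

manager_switchover: true                               # off by default
manager_election_delay_after_quorum_loss: 30s          # wait for network recovery before stepping down
manager_lock_acquire_delay_after_quorum_loss: 45s      # stay out of the election after stepping down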