Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(sns): Do not abort SNS upgrade due to recoverable state corruption #2614

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 45 additions & 44 deletions rs/sns/governance/src/governance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2892,7 +2892,7 @@ impl Governance {

if self.env.now() > mark_failed_at_seconds {
let error = format!(
"Upgrade marked as failed at {} seconds from genesis. \
"Upgrade marked as failed at timestamp {} seconds. \
Did not find an upgrade in the ledger's canister_info recent_changes.",
self.env.now(),
);
Expand Down Expand Up @@ -5426,63 +5426,64 @@ impl Governance {

let deployed_version = match self.proto.deployed_version.clone() {
None => {
let error = format!(
"Upgrade marked as failed at {} seconds from genesis. \
Governance had no recorded deployed_version. \
Setting it to currently running version and failing upgrade.",
let message = format!(
"SNS Governance had no recorded deployed_version at timestamp {} seconds. \
Setting it to currently running {:?} and attempting to proceed.",
self.env.now(),
running_version,
);

self.push_to_upgrade_journal(upgrade_journal_entry::UpgradeOutcome::invalid_state(
error.to_string(),
message.to_string(),
None,
));

self.proto.deployed_version = Some(running_version);
self.fail_sns_upgrade_to_next_version_proposal(
proposal_id,
GovernanceError::new_with_message(ErrorType::PreconditionFailed, error),
);
return;
self.proto.deployed_version = Some(running_version.clone());

running_version.clone()
}
Some(version) => version,
};

let expected_changes = deployed_version.changes_against(&target_version);
let expected_changes = {
let expected_changes = deployed_version.changes_against(&target_version);
running_version.version_has_expected_hashes(&expected_changes)
};

match running_version.version_has_expected_hashes(&expected_changes) {
Ok(_) => {
log!(
INFO,
"Upgrade marked successful at {} from genesis. New Version: {:?}",
if let Err(errs) = expected_changes {
if self.env.now() > mark_failed_at {
let message = format!(
"Upgrade marked as failed at timestamp {} seconds. \
Running system version does not match expected state:\n- {:?}",
self.env.now(),
target_version
errs.join("- {}\n"),
);
self.push_to_upgrade_journal(upgrade_journal_entry::UpgradeOutcome::success(None));
self.set_proposal_execution_status(proposal_id, Ok(()));
self.proto.deployed_version = Some(target_version);
self.proto.pending_version = None;
}
Err(errors) => {
// We are past mark_failed_at_seconds.
if self.env.now() > mark_failed_at {
let error = format!(
"Upgrade marked as failed at {} seconds from genesis. \
Running system version does not match expected state.\n{:?}",
self.env.now(),
errors
);

self.push_to_upgrade_journal(upgrade_journal_entry::UpgradeOutcome::timeout(
error.to_string(),
));
self.fail_sns_upgrade_to_next_version_proposal(
proposal_id,
GovernanceError::new_with_message(ErrorType::External, error),
);
}
self.push_to_upgrade_journal(upgrade_journal_entry::UpgradeOutcome::timeout(
message.to_string(),
));
self.fail_sns_upgrade_to_next_version_proposal(
proposal_id,
GovernanceError::new_with_message(ErrorType::External, message),
);
}

// Returning here because (1) the expected changes were not observed yet and (2) either
// the upgrade has timed out or there will be another attempt in the next periodic task.
return;
}

log!(
INFO,
"Upgrade marked successful at timestamp {} seconds, new {:?}",
self.env.now(),
target_version
);
self.push_to_upgrade_journal(upgrade_journal_entry::UpgradeOutcome::success(None));
self.set_proposal_execution_status(proposal_id, Ok(()));

self.proto.deployed_version = Some(target_version);
self.proto.pending_version = None;
}

// This method sets internal state to remove pending_version and sets the proposal status to
Expand Down Expand Up @@ -8421,9 +8422,9 @@ mod tests {
GovernanceError::new_with_message(
ErrorType::PreconditionFailed,
format!(
"Upgrade marked as failed at {} seconds from genesis. \
Governance had no recorded deployed_version. \
Setting it to currently running version and failing upgrade.",
"Upgrade marked as failed at timestamp {} seconds. \
Governance had no recorded deployed_version. \
Setting it to currently running version and failing upgrade.",
now
)
)
Expand Down
Loading