Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow instance_reconfigure to resize a stopped instance #6774

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
4 changes: 3 additions & 1 deletion common/src/api/external/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,9 @@ impl InstanceState {
}

/// The number of CPUs in an Instance
#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[derive(
Copy, Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq,
)]
pub struct InstanceCpuCount(pub u16);

impl TryFrom<i64> for InstanceCpuCount {
Expand Down
4 changes: 4 additions & 0 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,4 +542,8 @@ pub struct InstanceUpdate {
/// set the instance's auto-restart policy to `NULL`.
#[diesel(column_name = auto_restart_policy)]
pub auto_restart_policy: Option<InstanceAutoRestartPolicy>,

pub ncpus: InstanceCpuCount,

pub memory: ByteCount,
hawkw marked this conversation as resolved.
Show resolved Hide resolved
}
10 changes: 9 additions & 1 deletion nexus/db-model/src/instance_cpu_count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,15 @@ use serde::Serialize;
use std::convert::TryFrom;

#[derive(
Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize,
Copy,
Clone,
Debug,
AsExpression,
FromSqlRow,
Serialize,
Deserialize,
PartialEq,
Eq,
)]
#[diesel(sql_type = sql_types::BigInt)]
pub struct InstanceCpuCount(pub external::InstanceCpuCount);
Expand Down
5 changes: 5 additions & 0 deletions nexus/db-model/src/instance_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ impl_enum_type!(
);

impl InstanceState {
/// Instance states where there is not currently a VMM incarnating this
/// instance, but might be in the future.
pub const NO_VMM_STATES: &'static [InstanceState] =
&[InstanceState::NoVmm, InstanceState::Creating, InstanceState::Failed];

pub fn state(&self) -> external::InstanceState {
external::InstanceState::from(*self)
}
Expand Down
173 changes: 113 additions & 60 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::identity::Resource;
use crate::db::lookup::LookupPath;
use crate::db::model::ByteCount;
use crate::db::model::Generation;
use crate::db::model::Instance;
use crate::db::model::InstanceAutoRestart;
use crate::db::model::InstanceAutoRestartPolicy;
use crate::db::model::InstanceCpuCount;
use crate::db::model::InstanceRuntimeState;
use crate::db::model::InstanceState;
use crate::db::model::InstanceUpdate;
Expand Down Expand Up @@ -1038,8 +1040,12 @@ impl DataStore {
.transaction_retry_wrapper("reconfigure_instance")
.transaction(&conn, |conn| {
let err = err.clone();
let InstanceUpdate { boot_disk_id, auto_restart_policy } =
update.clone();
let InstanceUpdate {
boot_disk_id,
auto_restart_policy,
ncpus,
memory,
} = update.clone();
async move {
// Set the auto-restart policy.
diesel::update(instance_dsl::instance)
Expand All @@ -1051,11 +1057,21 @@ impl DataStore {
.execute_async(&conn)
.await?;

// Set vCPUs and memory size.
self.instance_set_size_on_conn(
&conn,
&err,
&authz_instance,
ncpus,
memory,
)
.await?;

// Next, set the boot disk if needed.
self.instance_set_boot_disk_on_conn(
&conn,
&err,
authz_instance,
&authz_instance,
boot_disk_id,
)
.await?;
Expand All @@ -1077,14 +1093,16 @@ impl DataStore {
Ok(instance_and_vmm)
}

/// Set the boot disk on an instance, bypassing the rest of an instance
/// update. You probably don't need this; it's only used at the end of
/// instance creation, since the boot disk can't be set until the new
/// instance's disks are all attached.
pub async fn instance_set_boot_disk(
&self,
opctx: &OpContext,
authz_instance: &authz::Instance,
boot_disk_id: Option<Uuid>,
) -> Result<(), Error> {
opctx.authorize(authz::Action::Modify, authz_instance).await?;

let err = OptionalError::new();
let conn = self.pool_connection_authorized(opctx).await?;
self.transaction_retry_wrapper("instance_set_boot_disk")
Expand Down Expand Up @@ -1128,48 +1146,9 @@ impl DataStore {
use db::schema::disk::dsl as disk_dsl;
use db::schema::instance::dsl as instance_dsl;

// * Allow setting the boot disk in NoVmm because there is no VMM to
// contend with.
// * Allow setting the boot disk in Failed to allow changing the boot
// disk of a failed instance and free its boot disk for detach.
// * Allow setting the boot disk in Creating because one of the last
// steps of instance creation, while the instance is still in
// Creating, is to reconfigure the instance to the desired boot disk.
Comment on lines -1131 to -1137
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe worth keeping a comment similar to this one, but moving it to the filter(instance_dsl::state.eq_any(NOT_INCARNATED_STATES)) line? not a big deal though --- I do think it's nice to document someplace in here "we can't set the boot disk if the instance is incarnated because, you know..."

const OK_TO_SET_BOOT_DISK_STATES: &'static [InstanceState] = &[
InstanceState::NoVmm,
InstanceState::Failed,
InstanceState::Creating,
];

let maybe_instance = instance_dsl::instance
.filter(instance_dsl::id.eq(authz_instance.id()))
.filter(instance_dsl::time_deleted.is_null())
.select(Instance::as_select())
.first_async::<Instance>(conn)
.await;
let instance = match maybe_instance {
Ok(i) => i,
Err(diesel::NotFound) => {
// If the instance simply doesn't exist, we
// shouldn't retry. Bail with a useful error.
return Err(err.bail(Error::not_found_by_id(
ResourceType::Instance,
&authz_instance.id(),
)));
}
Err(e) => return Err(e),
};

// If the desired boot disk is already set, we're good here, and can
// elide the check that the instance is in an acceptable state to change
// the boot disk.
if instance.boot_disk_id == boot_disk_id {
return Ok(());
}

if let Some(disk_id) = boot_disk_id {
// Ensure the disk is currently attached before updating
// the database.
// Ensure the disk is currently attached before updating the
// database.
let expected_state =
api::external::DiskState::Attached(authz_instance.id());

Expand All @@ -1188,28 +1167,102 @@ impl DataStore {
);
}
}
//
// NOTE: from this point forward it is OK if we update the
// instance's `boot_disk_id` column with the updated value
// again. It will have already been assigned with constraint
// checking performed above, so updates will just be
// repetitive, not harmful.

let r = diesel::update(instance_dsl::instance)
let query = diesel::update(instance_dsl::instance)
.into_boxed()
.filter(instance_dsl::id.eq(authz_instance.id()))
.filter(instance_dsl::state.eq_any(OK_TO_SET_BOOT_DISK_STATES))
.filter(instance_dsl::state.eq_any(InstanceState::NO_VMM_STATES));
let query = if boot_disk_id.is_some() {
query.filter(
instance_dsl::boot_disk_id
.ne(boot_disk_id)
.or(instance_dsl::boot_disk_id.is_null()),
)
} else {
query.filter(instance_dsl::boot_disk_id.is_not_null())
hawkw marked this conversation as resolved.
Show resolved Hide resolved
};

let r = query
.set(instance_dsl::boot_disk_id.eq(boot_disk_id))
.check_if_exists::<Instance>(authz_instance.id())
.execute_and_check(&conn)
.await?;
match r.status {
UpdateStatus::NotUpdatedButExists => {
// This should be the only reason the query would fail...
debug_assert!(!OK_TO_SET_BOOT_DISK_STATES
.contains(&r.found.runtime().nexus_state));
Err(err.bail(Error::conflict(
"instance must be stopped to set boot disk",
)))
if r.found.boot_disk_id == boot_disk_id {
// Not updated, because the update is no change..
return Ok(());
}

if !InstanceState::NO_VMM_STATES
.contains(&r.found.runtime().nexus_state)
{
return Err(err.bail(Error::conflict(
"instance must be stopped to set boot disk",
)));
}
hawkw marked this conversation as resolved.
Show resolved Hide resolved

// There should be no other reason the update fails on an
// existing instance.
return Err(err.bail(Error::internal_error(
"unable to reconfigure instance boot disk",
)));
Comment on lines +1208 to +1212
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it might be worth adding more details to the internal message for this error, since it indicates pretty clearly that there was a bug --- it would be nice if the instance record was logged if this ever did happen? not a blocker though.

}
UpdateStatus::Updated => Ok(()),
}
}

/// Set an instance's CPU count and memory size to the provided values,
/// within an existing transaction.
///
/// Does not allow setting sizes of running instances to ensure that if an
/// instance is running, its resource reservation matches what we record in
/// the database.
Comment on lines +1221 to +1223
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpicky, not a big deal: maybe explicitly state that Error::Conflict is returned in that case?

async fn instance_set_size_on_conn(
&self,
conn: &async_bb8_diesel::Connection<DbConnection>,
err: &OptionalError<Error>,
authz_instance: &authz::Instance,
ncpus: InstanceCpuCount,
memory: ByteCount,
) -> Result<(), diesel::result::Error> {
use db::schema::instance::dsl as instance_dsl;

let r = diesel::update(instance_dsl::instance)
.filter(instance_dsl::id.eq(authz_instance.id()))
.filter(instance_dsl::state.eq_any(InstanceState::NO_VMM_STATES))
.filter(
instance_dsl::ncpus
.ne(ncpus)
.or(instance_dsl::memory.ne(memory)),
)
.set((
instance_dsl::ncpus.eq(ncpus),
instance_dsl::memory.eq(memory),
))
.check_if_exists::<Instance>(authz_instance.id())
.execute_and_check(&conn)
.await?;
match r.status {
UpdateStatus::NotUpdatedButExists => {
if (r.found.ncpus, r.found.memory) == (ncpus, memory) {
// Not updated, because the update is no change..
return Ok(());
}

if !InstanceState::NO_VMM_STATES
.contains(&r.found.runtime().nexus_state)
{
return Err(err.bail(Error::conflict(
"instance must be stopped to be resized",
)));
}

// There should be no other reason the update fails on an
// existing instance.
return Err(err.bail(Error::internal_error(
"unable to reconfigure instance size",
)));
Comment on lines +1267 to +1271
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similarly, this indicates some kind of bug in nexus, so it might be nice to record additional information about it, in case we end up debugging this in the future? not a big deal though.

}
UpdateStatus::Updated => Ok(()),
}
Expand Down
Loading
Loading