Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow instance_reconfigure to resize a stopped instance #6774

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
4 changes: 3 additions & 1 deletion common/src/api/external/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,9 @@ impl InstanceState {
}

/// The number of CPUs in an Instance
#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[derive(
Copy, Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq,
)]
pub struct InstanceCpuCount(pub u16);

impl TryFrom<i64> for InstanceCpuCount {
Expand Down
4 changes: 4 additions & 0 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,4 +542,8 @@ pub struct InstanceUpdate {
/// set the instance's auto-restart policy to `NULL`.
#[diesel(column_name = auto_restart_policy)]
pub auto_restart_policy: Option<InstanceAutoRestartPolicy>,

pub ncpus: InstanceCpuCount,

pub memory: ByteCount,
hawkw marked this conversation as resolved.
Show resolved Hide resolved
}
10 changes: 9 additions & 1 deletion nexus/db-model/src/instance_cpu_count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,15 @@ use serde::Serialize;
use std::convert::TryFrom;

#[derive(
Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize,
Copy,
Clone,
Debug,
AsExpression,
FromSqlRow,
Serialize,
Deserialize,
PartialEq,
Eq,
)]
#[diesel(sql_type = sql_types::BigInt)]
pub struct InstanceCpuCount(pub external::InstanceCpuCount);
Expand Down
103 changes: 96 additions & 7 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::identity::Resource;
use crate::db::lookup::LookupPath;
use crate::db::model::ByteCount;
use crate::db::model::Generation;
use crate::db::model::Instance;
use crate::db::model::InstanceAutoRestart;
use crate::db::model::InstanceAutoRestartPolicy;
use crate::db::model::InstanceCpuCount;
use crate::db::model::InstanceRuntimeState;
use crate::db::model::InstanceState;
use crate::db::model::InstanceUpdate;
Expand Down Expand Up @@ -1038,8 +1040,12 @@ impl DataStore {
.transaction_retry_wrapper("reconfigure_instance")
.transaction(&conn, |conn| {
let err = err.clone();
let InstanceUpdate { boot_disk_id, auto_restart_policy } =
update.clone();
let InstanceUpdate {
boot_disk_id,
auto_restart_policy,
ncpus,
memory,
} = update.clone();
async move {
// Set the auto-restart policy.
diesel::update(instance_dsl::instance)
Expand All @@ -1051,6 +1057,16 @@ impl DataStore {
.execute_async(&conn)
.await?;

// Set vCPUs and memory size.
self.instance_set_size_on_conn(
&conn,
&err,
authz_instance,
ncpus,
memory,
)
.await?;
iximeow marked this conversation as resolved.
Show resolved Hide resolved

// Next, set the boot disk if needed.
self.instance_set_boot_disk_on_conn(
&conn,
Expand Down Expand Up @@ -1141,13 +1157,13 @@ impl DataStore {
InstanceState::Creating,
];

let maybe_instance = instance_dsl::instance
let maybe_old_boot_disk_id = instance_dsl::instance
.filter(instance_dsl::id.eq(authz_instance.id()))
.filter(instance_dsl::time_deleted.is_null())
.select(Instance::as_select())
.first_async::<Instance>(conn)
.select(instance_dsl::boot_disk_id)
.first_async::<Option<Uuid>>(conn)
.await;
let instance = match maybe_instance {
let old_boot_disk_id = match maybe_old_boot_disk_id {
Ok(i) => i,
Err(diesel::NotFound) => {
// If the instance simply doesn't exist, we
Expand All @@ -1163,7 +1179,7 @@ impl DataStore {
// If the desired boot disk is already set, we're good here, and can
// elide the check that the instance is in an acceptable state to change
// the boot disk.
if instance.boot_disk_id == boot_disk_id {
if old_boot_disk_id == boot_disk_id {
return Ok(());
}

Expand Down Expand Up @@ -1215,6 +1231,79 @@ impl DataStore {
}
}

/// Set an instance's CPU count and memory size to the provided values,
/// within an existing transaction.
async fn instance_set_size_on_conn(
&self,
conn: &async_bb8_diesel::Connection<DbConnection>,
err: &OptionalError<Error>,
authz_instance: &authz::Instance,
ncpus: InstanceCpuCount,
memory: ByteCount,
) -> Result<(), diesel::result::Error> {
use db::schema::instance::dsl as instance_dsl;

// Do not allow setting sizes of running instances to ensure that if an
// instance is running, its resource usage matches what we record in the
// database.
const OK_TO_SET_SIZE_STATES: &'static [InstanceState] = &[
InstanceState::NoVmm,
InstanceState::Failed,
InstanceState::Creating,
];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It occurs to me that OK_TO_SET_SIZE_STATES and OK_TO_SET_BOOT_DISK_STATES are the same list of states, and what we really want in both cases is "states in which there is no VMM actively incarnating this instance". I kind of wonder if, rather than duplicating in both the instance-resize and instance-set-boot-disk queries, we should just have a constant on InstanceState that's like NOT_INCARNATED_STATES or similar, so that these queries and any future ones can all use the same set of states, and if we add new states that have "not incarnated" semantics, we can add them to the one definition of that list of states, instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

on one hand i agree (and enough to go just do the thing, there's now InstanceState::NO_VMM_STATES*), but from the conversations we've had we've been pretty close to having caveouts for specific not-incarnated states and fields. namely "is it OK to set the boot disk in Creating", and does allowing that allow correctness issues (no) or a possibly-confusing-but-correct API result (yes). otoh "actions we can take on not-incarnated instances" is a much easier concept than having to think through each kind of action and state combination, so it's nicer in an axiomatic way.

*: after writing this i realize why you said NOT_INCARNATED rather than NO_VMM - because there is a NoVmm state which is very distinct from what this list is getting at. so i should go change that name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, my thinking is that we should avoid having weird exceptions for setting specific parameters unless it's absolutely necessary. I'd be quite surprised if we don't basically just end up being able to divide every possible instance_reconfigure into either "can do this only if there is no Real Life VMM Process incarnating the instance" and "can do this whenever", and I'd prefer to have to burn that bridge only once we come to it...


let maybe_old_sizes = instance_dsl::instance
.filter(instance_dsl::id.eq(authz_instance.id()))
.filter(instance_dsl::time_deleted.is_null())
.select((instance_dsl::ncpus, instance_dsl::memory))
.first_async::<(InstanceCpuCount, ByteCount)>(conn)
.await;
iximeow marked this conversation as resolved.
Show resolved Hide resolved

// Might be nice to extract `old_sizes` into an `InstanceDimensions` or
// something? Not processing the tuple `(InstanceCpuCount, ByteCount)`
// elsewhere yet, so maybe that would be premature...
let old_sizes = match maybe_old_sizes {
Ok(i) => i,
Err(diesel::NotFound) => {
// If the instance simply doesn't exist, we
// shouldn't retry. Bail with a useful error.
return Err(err.bail(Error::not_found_by_id(
ResourceType::Instance,
&authz_instance.id(),
)));
}
Err(e) => return Err(e),
};

// Bail out early if the extant vCPU and memory settings match the
// "new values".
if old_sizes == (ncpus, memory) {
return Ok(());
}

let r = diesel::update(instance_dsl::instance)
.filter(instance_dsl::id.eq(authz_instance.id()))
.filter(instance_dsl::state.eq_any(OK_TO_SET_SIZE_STATES))
.set((
instance_dsl::ncpus.eq(ncpus),
instance_dsl::memory.eq(memory),
))
.check_if_exists::<Instance>(authz_instance.id())
.execute_and_check(&conn)
.await?;
match r.status {
UpdateStatus::NotUpdatedButExists => {
// This should be the only reason the query would fail...
debug_assert!(!OK_TO_SET_SIZE_STATES
.contains(&r.found.runtime().nexus_state));
Err(err.bail(Error::conflict(
"instance must be stopped to be resized",
)))
}
UpdateStatus::Updated => Ok(()),
}
}

pub async fn project_delete_instance(
&self,
opctx: &OpContext,
Expand Down
104 changes: 60 additions & 44 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ use omicron_common::api::external::CreateResult;
use omicron_common::api::external::DataPageParams;
use omicron_common::api::external::DeleteResult;
use omicron_common::api::external::Error;
use omicron_common::api::external::InstanceCpuCount;
use omicron_common::api::external::InstanceState;
use omicron_common::api::external::InternalContext;
use omicron_common::api::external::ListResultVec;
Expand Down Expand Up @@ -311,6 +312,8 @@ impl super::Nexus {
let (.., authz_project, authz_instance) =
instance_lookup.lookup_for(authz::Action::Modify).await?;

check_instance_cpu_memory_sizes(params.ncpus, params.memory)?;

let boot_disk_id = match params.boot_disk.clone() {
Some(disk) => {
let selector = params::DiskSelector {
Expand All @@ -331,8 +334,11 @@ impl super::Nexus {
};

let auto_restart_policy = params.auto_restart_policy.map(Into::into);
let ncpus = params.ncpus.into();
let memory = params.memory.into();

let update = InstanceUpdate { boot_disk_id, auto_restart_policy };
let update =
InstanceUpdate { boot_disk_id, auto_restart_policy, ncpus, memory };
self.datastore()
.instance_reconfigure(opctx, &authz_instance, update)
.await
Expand All @@ -347,6 +353,8 @@ impl super::Nexus {
let (.., authz_project) =
project_lookup.lookup_for(authz::Action::CreateChild).await?;

check_instance_cpu_memory_sizes(params.ncpus, params.memory)?;

let all_disks: Vec<&params::InstanceDiskAttachment> =
params.boot_disk.iter().chain(params.disks.iter()).collect();

Expand All @@ -363,12 +371,6 @@ impl super::Nexus {
.await?;
}
}
if params.ncpus.0 > MAX_VCPU_PER_INSTANCE {
return Err(Error::invalid_request(&format!(
"cannot have more than {} vCPUs per instance",
MAX_VCPU_PER_INSTANCE
)));
}
if params.external_ips.len() > MAX_EXTERNAL_IPS_PER_INSTANCE {
return Err(Error::invalid_request(&format!(
"An instance may not have more than {} external IP addresses",
Expand Down Expand Up @@ -413,43 +415,6 @@ impl super::Nexus {
}
}

// Reject instances where the memory is not at least
// MIN_MEMORY_BYTES_PER_INSTANCE
if params.memory.to_bytes() < u64::from(MIN_MEMORY_BYTES_PER_INSTANCE) {
return Err(Error::invalid_value(
"size",
format!(
"memory must be at least {}",
ByteCount::from(MIN_MEMORY_BYTES_PER_INSTANCE)
),
));
}

// Reject instances where the memory is not divisible by
// MIN_MEMORY_BYTES_PER_INSTANCE
if (params.memory.to_bytes() % u64::from(MIN_MEMORY_BYTES_PER_INSTANCE))
!= 0
{
return Err(Error::invalid_value(
"size",
format!(
"memory must be divisible by {}",
ByteCount::from(MIN_MEMORY_BYTES_PER_INSTANCE)
),
));
}

// Reject instances where the memory is greater than the limit
if params.memory.to_bytes() > MAX_MEMORY_BYTES_PER_INSTANCE {
return Err(Error::invalid_value(
"size",
format!(
"memory must be less than or equal to {}",
ByteCount::try_from(MAX_MEMORY_BYTES_PER_INSTANCE).unwrap()
),
));
}

let actor = opctx.authn.actor_required().internal_context(
"loading current user's ssh keys for new Instance",
)?;
Expand Down Expand Up @@ -2187,6 +2152,57 @@ pub(crate) async fn process_vmm_update(
}
}

/// Determines whether the supplied instance sizes (CPU count and memory size)
/// are acceptable.
fn check_instance_cpu_memory_sizes(
ncpus: InstanceCpuCount,
memory: ByteCount,
) -> Result<(), Error> {
if ncpus.0 > MAX_VCPU_PER_INSTANCE {
return Err(Error::invalid_request(&format!(
"cannot have more than {} vCPUs per instance",
MAX_VCPU_PER_INSTANCE
)));
}

// Reject instances where the memory is not at least
// MIN_MEMORY_BYTES_PER_INSTANCE
if memory.to_bytes() < u64::from(MIN_MEMORY_BYTES_PER_INSTANCE) {
return Err(Error::invalid_value(
"size",
format!(
"memory must be at least {}",
ByteCount::from(MIN_MEMORY_BYTES_PER_INSTANCE)
),
));
}

// Reject instances where the memory is not divisible by
// MIN_MEMORY_BYTES_PER_INSTANCE
if (memory.to_bytes() % u64::from(MIN_MEMORY_BYTES_PER_INSTANCE)) != 0 {
return Err(Error::invalid_value(
"size",
format!(
"memory must be divisible by {}",
ByteCount::from(MIN_MEMORY_BYTES_PER_INSTANCE)
),
));
}

// Reject instances where the memory is greater than the limit
if memory.to_bytes() > MAX_MEMORY_BYTES_PER_INSTANCE {
return Err(Error::invalid_value(
"size",
format!(
"memory must be less than or equal to {}",
ByteCount::try_from(MAX_MEMORY_BYTES_PER_INSTANCE).unwrap()
),
));
}

Ok(())
}

/// Determines the disposition of a request to start an instance given its state
/// (and its current VMM's state, if it has one) in the database.
fn instance_start_allowed(
Expand Down
2 changes: 2 additions & 0 deletions nexus/tests/integration_tests/endpoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,8 @@ pub static DEMO_INSTANCE_UPDATE: Lazy<params::InstanceUpdate> =
Lazy::new(|| params::InstanceUpdate {
boot_disk: None,
auto_restart_policy: None,
ncpus: InstanceCpuCount(1),
memory: ByteCount::from_gibibytes_u32(16),
});

// The instance needs a network interface, too.
Expand Down
Loading
Loading