Skip to content

Commit 0c7fb27

Browse files
authored
[nexus] Allow stopping Failed instances (#6652)
PR #6503 changed Nexus to attempt to automatically restart instances which are in the `Failed` state. Now that we do this, we should probably change the allowable instance state transitions to permit a user to stop an instance that is `Failed`, as a way to say "stop trying to restart this instance" (as `Stopped` instances are not restarted). This branch changes `Nexus::instance_request_state` and `select_instance_change_action` to permit stopping a `Failed` instance. Fixes #6640 Fixes #2825, along with #6455 (which allowed restarting `Failed` instances).
1 parent dfe628a commit 0c7fb27

File tree

2 files changed

+92
-4
lines changed

2 files changed

+92
-4
lines changed

nexus/src/app/instance.rs

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,10 @@ enum InstanceStateChangeRequestAction {
233233
/// Request the appropriate state change from the sled with the specified
234234
/// UUID.
235235
SendToSled { sled_id: SledUuid, propolis_id: PropolisUuid },
236+
237+
/// The instance is not currently incarnated on a sled, so just update its
238+
/// runtime state in the database without communicating with a sled-agent.
239+
UpdateRuntime(db::model::InstanceRuntimeState),
236240
}
237241

238242
/// What is the higher level operation that is calling
@@ -811,13 +815,39 @@ impl super::Nexus {
811815
still being created"
812816
))
813817
}
818+
// Failed instances may transition to Stopped by just changing
819+
// the Nexus state in the database to NoVmm.
820+
//
821+
// An instance's effective state will never be Failed while it
822+
// is linked with a VMM. If the instance has an active VMM which
823+
// is Failed, the instance's effective state will be Stopping,
824+
// rather than Failed, until an instance-update saga has
825+
// unlinked the Failed VMM. We can guarantee this is the case,
826+
// as a database CHECK constraint will not permit an instance
827+
// with an active Propolis ID to be Failed. Therefore, we know
828+
// that a Failed instance is definitely not incarnated on a
829+
// sled, so all we need to do to "stop" it is to update its
830+
// state in the database.
831+
InstanceState::Failed if matches!(requested, InstanceStateChangeRequest::Stop) => {
832+
// As discussed above, this shouldn't happen, so return an
833+
// internal error and complain about it in the logs.
834+
if vmm_state.is_some() {
835+
return Err(Error::internal_error(
836+
"an instance should not be in the Failed \
837+
effective state if it has an active VMM"
838+
));
839+
}
814840

841+
let prev_runtime = instance_state.runtime();
842+
return Ok(InstanceStateChangeRequestAction::UpdateRuntime(db::model::InstanceRuntimeState {
843+
time_updated: chrono::Utc::now(),
844+
r#gen: prev_runtime.r#gen.0.next().into(),
845+
nexus_state: db::model::InstanceState::NoVmm,
846+
..prev_runtime.clone()
847+
}));
848+
}
815849
// If the instance has no sled beacuse it's been destroyed or
816850
// has fallen over, reject the state change.
817-
//
818-
// TODO(#2825): Failed instances should be allowed to stop, but
819-
// this requires a special action because there is no sled to
820-
// send the request to.
821851
InstanceState::Failed | InstanceState::Destroyed => {
822852
return Err(Error::invalid_request(&format!(
823853
"instance state cannot be changed from {}",
@@ -898,6 +928,26 @@ impl super::Nexus {
898928
&requested,
899929
)? {
900930
InstanceStateChangeRequestAction::AlreadyDone => Ok(()),
931+
InstanceStateChangeRequestAction::UpdateRuntime(new_runtime) => {
932+
let instance_id =
933+
InstanceUuid::from_untyped_uuid(prev_instance_state.id());
934+
let changed = self
935+
.datastore()
936+
.instance_update_runtime(&instance_id, &new_runtime)
937+
.await
938+
.map_err(InstanceStateChangeError::Other)?;
939+
if !changed {
940+
// TODO(eliza): perhaps we should refetch the instance here
941+
// and return Ok if it was in the desired state...
942+
Err(InstanceStateChangeError::Other(Error::conflict(
943+
"The instance was previously in a state that allowed \
944+
the requested state change, but the instance's state \
945+
changed before the request could be completed",
946+
)))
947+
} else {
948+
Ok(())
949+
}
950+
}
901951
InstanceStateChangeRequestAction::SendToSled {
902952
sled_id,
903953
propolis_id,

nexus/tests/integration_tests/instances.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1177,6 +1177,44 @@ async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_deleted(
11771177
expect_instance_delete_ok(client, instance_name).await;
11781178
}
11791179

1180+
// Verifies that if a request to reboot or stop an instance fails because of a
1181+
// 404 error from sled agent, then the instance moves to the Failed state, and
1182+
// can then be Stopped once it has transitioned to Failed.
1183+
#[nexus_test]
1184+
async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_stopped(
1185+
cptestctx: &ControlPlaneTestContext,
1186+
) {
1187+
let client = &cptestctx.external_client;
1188+
1189+
let instance_name = "losing-is-fun";
1190+
let instance_id = make_forgotten_instance(
1191+
&cptestctx,
1192+
instance_name,
1193+
InstanceAutoRestartPolicy::Never,
1194+
)
1195+
.await;
1196+
1197+
// Attempting to reboot the forgotten instance will result in a 404
1198+
// NO_SUCH_INSTANCE from the sled-agent, which Nexus turns into a 503.
1199+
expect_instance_reboot_fail(
1200+
client,
1201+
instance_name,
1202+
http::StatusCode::SERVICE_UNAVAILABLE,
1203+
)
1204+
.await;
1205+
1206+
// Wait for the instance to transition to Failed.
1207+
instance_wait_for_state(client, instance_id, InstanceState::Failed).await;
1208+
1209+
// Now, it should be possible to stop the instance.
1210+
let instance_next =
1211+
instance_post(&client, instance_name, InstanceOp::Stop).await;
1212+
assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped);
1213+
1214+
// Now, the Stopped nstance should be deleteable..
1215+
expect_instance_delete_ok(client, instance_name).await;
1216+
}
1217+
11801218
// Verifies that the instance-watcher background task transitions an instance
11811219
// to Failed when the sled-agent returns a 404, and that the instance can be
11821220
// deleted after it transitions to Failed.

0 commit comments

Comments
 (0)