[nexus] Allow stopping Failed instances (#6652)

hawkw · web-flow · commit 0c7fb27a3cbb · 2024-09-24T23:35:37.000Z
PR #6503 changed Nexus to attempt to automatically restart instances which are in the `Failed` state. Now that we do this, we should probably change the allowable instance state transitions to permit a user to stop an instance that is `Failed`, as a way to say "stop trying to restart this instance" (as `Stopped` instances are not restarted). This branch changes `Nexus::instance_request_state` and `select_instance_change_action` to permit stopping a `Failed` instance. Fixes #6640 Fixes #2825, along with #6455 (which allowed restarting `Failed` instances).
diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
@@ -233,6 +233,10 @@ enum InstanceStateChangeRequestAction {
     /// Request the appropriate state change from the sled with the specified
     /// UUID.
     SendToSled { sled_id: SledUuid, propolis_id: PropolisUuid },
+
+    /// The instance is not currently incarnated on a sled, so just update its
+    /// runtime state in the database without communicating with a sled-agent.
+    UpdateRuntime(db::model::InstanceRuntimeState),
 }
 
 /// What is the higher level operation that is calling
@@ -811,13 +815,39 @@ impl super::Nexus {
                                 still being created"
                                 ))
                 }
+                // Failed instances may transition to Stopped by just changing
+                // the Nexus state in the database to NoVmm.
+                //
+                // An instance's effective state will never be Failed while it
+                // is linked with a VMM. If the instance has an active VMM which
+                // is Failed, the instance's effective state will be Stopping,
+                // rather than Failed, until an instance-update saga has
+                // unlinked the Failed VMM. We can guarantee this is the case,
+                // as a database CHECK constraint will not permit an instance
+                // with an active Propolis ID to be Failed. Therefore, we know
+                // that a Failed instance is definitely not incarnated on a
+                // sled, so all we need to do to "stop" it is to update its
+                // state in the database.
+                InstanceState::Failed if matches!(requested, InstanceStateChangeRequest::Stop) => {
+                    // As discussed above, this shouldn't happen, so return an
+                    // internal error and complain about it in the logs.
+                    if vmm_state.is_some() {
+                        return Err(Error::internal_error(
+                            "an instance should not be in the Failed \
+                             effective state if it has an active VMM"
+                        ));
+                    }
 
+                    let prev_runtime = instance_state.runtime();
+                    return Ok(InstanceStateChangeRequestAction::UpdateRuntime(db::model::InstanceRuntimeState {
+                        time_updated: chrono::Utc::now(),
+                        r#gen: prev_runtime.r#gen.0.next().into(),
+                        nexus_state: db::model::InstanceState::NoVmm,
+                        ..prev_runtime.clone()
+                    }));
+                }
                 // If the instance has no sled beacuse it's been destroyed or
                 // has fallen over, reject the state change.
-                //
-                // TODO(#2825): Failed instances should be allowed to stop, but
-                // this requires a special action because there is no sled to
-                // send the request to.
                 InstanceState::Failed | InstanceState::Destroyed => {
                     return Err(Error::invalid_request(&format!(
                         "instance state cannot be changed from {}",
@@ -898,6 +928,26 @@ impl super::Nexus {
             &requested,
         )? {
             InstanceStateChangeRequestAction::AlreadyDone => Ok(()),
+            InstanceStateChangeRequestAction::UpdateRuntime(new_runtime) => {
+                let instance_id =
+                    InstanceUuid::from_untyped_uuid(prev_instance_state.id());
+                let changed = self
+                    .datastore()
+                    .instance_update_runtime(&instance_id, &new_runtime)
+                    .await
+                    .map_err(InstanceStateChangeError::Other)?;
+                if !changed {
+                    // TODO(eliza): perhaps we should refetch the instance here
+                    // and return Ok if it was in the desired state...
+                    Err(InstanceStateChangeError::Other(Error::conflict(
+                        "The instance was previously in a state that allowed \
+                         the requested state change, but the instance's state \
+                          changed before the request could be completed",
+                    )))
+                } else {
+                    Ok(())
+                }
+            }
             InstanceStateChangeRequestAction::SendToSled {
                 sled_id,
                 propolis_id,
diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs
@@ -1177,6 +1177,44 @@ async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_deleted(
     expect_instance_delete_ok(client, instance_name).await;
 }
 
+// Verifies that if a request to reboot or stop an instance fails because of a
+// 404 error from sled agent, then the instance moves to the Failed state, and
+// can then be Stopped once it has transitioned to Failed.
+#[nexus_test]
+async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_stopped(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let client = &cptestctx.external_client;
+
+    let instance_name = "losing-is-fun";
+    let instance_id = make_forgotten_instance(
+        &cptestctx,
+        instance_name,
+        InstanceAutoRestartPolicy::Never,
+    )
+    .await;
+
+    // Attempting to reboot the forgotten instance will result in a 404
+    // NO_SUCH_INSTANCE from the sled-agent, which Nexus turns into a 503.
+    expect_instance_reboot_fail(
+        client,
+        instance_name,
+        http::StatusCode::SERVICE_UNAVAILABLE,
+    )
+    .await;
+
+    // Wait for the instance to transition to Failed.
+    instance_wait_for_state(client, instance_id, InstanceState::Failed).await;
+
+    // Now, it should be possible to stop the instance.
+    let instance_next =
+        instance_post(&client, instance_name, InstanceOp::Stop).await;
+    assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped);
+
+    // Now, the Stopped nstance should be deleteable..
+    expect_instance_delete_ok(client, instance_name).await;
+}
+
 // Verifies that the instance-watcher background task transitions an instance
 // to Failed when the sled-agent returns a 404, and that the instance can be
 // deleted after it transitions to Failed.