From 00c3fdc9921620ad2fc0b8f918328898654e84bd Mon Sep 17 00:00:00 2001 From: Tiago Castro Date: Mon, 16 Sep 2024 15:59:07 +0100 Subject: [PATCH] feat: re-shutdown nexus when node is online When nexus shutdown fails, if the node comes back online let's attempt the shutdown again. Signed-off-by: Tiago Castro --- .../core/controller/reconciler/nexus/mod.rs | 6 ++- .../reconciler/nexus/re_shutdown.rs | 50 +++++++++++++++++++ .../agents/src/bin/core/nexus/operations.rs | 38 +++++++++++++- .../agents/src/bin/core/volume/specs.rs | 13 +++++ .../stor-port/src/types/v0/store/nexus.rs | 13 ++++- 5 files changed, 116 insertions(+), 4 deletions(-) create mode 100644 control-plane/agents/src/bin/core/controller/reconciler/nexus/re_shutdown.rs diff --git a/control-plane/agents/src/bin/core/controller/reconciler/nexus/mod.rs b/control-plane/agents/src/bin/core/controller/reconciler/nexus/mod.rs index 6e96f19dd..bc3df5c67 100644 --- a/control-plane/agents/src/bin/core/controller/reconciler/nexus/mod.rs +++ b/control-plane/agents/src/bin/core/controller/reconciler/nexus/mod.rs @@ -1,5 +1,6 @@ pub(super) mod capacity; mod garbage_collector; +mod re_shutdown; use crate::{ controller::{ @@ -59,7 +60,10 @@ impl NexusReconciler { pub(crate) fn from(period: PollPeriods) -> Self { NexusReconciler { counter: PollTimer::from(period), - poll_targets: vec![Box::new(GarbageCollector::new())], + poll_targets: vec![ + Box::new(GarbageCollector::new()), + Box::new(re_shutdown::ReShutdown::new()), + ], } } /// Return new `Self` with the default period diff --git a/control-plane/agents/src/bin/core/controller/reconciler/nexus/re_shutdown.rs b/control-plane/agents/src/bin/core/controller/reconciler/nexus/re_shutdown.rs new file mode 100644 index 000000000..44acc5bf4 --- /dev/null +++ b/control-plane/agents/src/bin/core/controller/reconciler/nexus/re_shutdown.rs @@ -0,0 +1,50 @@ +use crate::controller::{ + reconciler::PollTriggerEvent, + resources::operations_helper::OperationSequenceGuard, + 
task_poller::{PollContext, PollEvent, PollResult, PollerState, TaskPoller}, +}; + +/// Retries failed nexus shutdowns, e.g. if a node comes back online with the nexus intact. +#[derive(Debug)] +pub(super) struct ReShutdown {} +impl ReShutdown { + /// Return a new `Self`. + pub(super) fn new() -> Self { + Self {} + } +} + +#[async_trait::async_trait] +impl TaskPoller for ReShutdown { + async fn poll(&mut self, context: &PollContext) -> PollResult { + // Retry each nexus which failed to shut down; skip it if any of the lookups below fail. + for nexus in context.registry().specs().failed_shutdown_nexuses().await { + let Some(volume_id) = &nexus.immutable_ref().owner else { + continue; + }; + let Ok(_volume) = context.specs().volume(volume_id).await else { + continue; + }; + + let Ok(mut nexus) = nexus.operation_guard() else { + continue; + }; + + nexus.re_shutdown_nexus(context.registry()).await; + } + PollResult::Ok(PollerState::Idle) + } + + async fn poll_timer(&mut self, _context: &PollContext) -> bool { + false + } + + async fn poll_event(&mut self, context: &PollContext) -> bool { + matches!( + context.event(), + PollEvent::Triggered(PollTriggerEvent::Start) + | PollEvent::Triggered(PollTriggerEvent::NodeStateChangeOnline) + | PollEvent::Triggered(PollTriggerEvent::NodeDrain) + ) + } +} diff --git a/control-plane/agents/src/bin/core/nexus/operations.rs b/control-plane/agents/src/bin/core/nexus/operations.rs index a86888bb5..4d444e31a 100644 --- a/control-plane/agents/src/bin/core/nexus/operations.rs +++ b/control-plane/agents/src/bin/core/nexus/operations.rs @@ -10,7 +10,7 @@ use crate::{ operations_helper::{ GuardedOperationsHelper, OnCreateFail, OperationSequenceGuard, SpecOperationsHelper, }, - OperationGuardArc, TraceSpan, UpdateInnerValue, + OperationGuardArc, TraceSpan, TraceStrLog, UpdateInnerValue, }, scheduling::resources::HealthyChildItems, wrapper::{GetterOps, NodeWrapper}, @@ -26,7 +26,8 @@ use stor_port::types::v0::{ transport::{ child::Child, nexus::{CreateNexus, DestroyNexus, Nexus, ResizeNexus,
ShareNexus, UnshareNexus}, - AddNexusChild, FaultNexusChild, NexusOwners, NodeStatus, RemoveNexusChild, ShutdownNexus, + AddNexusChild, FaultNexusChild, NexusOwners, NexusStatus, NodeStatus, RemoveNexusChild, + ShutdownNexus, }, }; @@ -564,4 +565,40 @@ impl OperationGuardArc { Ok(()) } + + /// Retry the shutdown of a nexus whose previous shutdown failed, e.g. because its node was + /// offline, now that the node may be available again. + /// This is a no-op unless the spec is shutdown, flagged as shutdown failed and has not been + /// re-shutdown yet. Errors are not reported: the nexus is simply left flagged as failed. + pub(crate) async fn re_shutdown_nexus(&mut self, registry: &Registry) { + if !self.as_ref().is_shutdown() + || !self.as_ref().status_info().shutdown_failed() + || self.as_ref().status_info().reshutdown() + { + return; + } + + let Ok(nexus_state) = registry.nexus(self.uuid()).await else { + return; + }; + + // The nexus state already reports shutdown, so there is nothing left to do. + if nexus_state.status == NexusStatus::Shutdown { + self.lock().status_info.set_reshutdown(); + return; + } + + let Ok(node) = registry.node_wrapper(&self.as_ref().node).await else { + return; + }; + + if node + .shutdown_nexus(&ShutdownNexus::new(self.uuid().clone(), false)) + .await + .is_ok() + { + self.info("Successfully re-shutdown nexus"); + self.lock().status_info.set_reshutdown(); + } + } } diff --git a/control-plane/agents/src/bin/core/volume/specs.rs b/control-plane/agents/src/bin/core/volume/specs.rs index 3f4ad14ba..07639e145 100644 --- a/control-plane/agents/src/bin/core/volume/specs.rs +++ b/control-plane/agents/src/bin/core/volume/specs.rs @@ -647,6 +647,21 @@ impl ResourceSpecsLocked { .collect() } + /// Get a list of resourced NexusSpecs which have failed to shut down and which have not + /// yet been re-shutdown. + pub(crate) async fn failed_shutdown_nexuses(&self) -> Vec<ResourceMutex<NexusSpec>> { + self.read() + .nexuses + .values() + .filter(|nexus| { + let nexus_spec = nexus.lock(); + let status = nexus_spec.status_info(); + nexus_spec.is_shutdown() && status.shutdown_failed() && !status.reshutdown() + }) + .cloned() + .collect() + } + /// Get the resourced volume nexus target for the given volume.
pub(crate) fn volume_target_nexus_rsc( &self, diff --git a/control-plane/stor-port/src/types/v0/store/nexus.rs b/control-plane/stor-port/src/types/v0/store/nexus.rs index 54069086e..72ff411aa 100644 --- a/control-plane/stor-port/src/types/v0/store/nexus.rs +++ b/control-plane/stor-port/src/types/v0/store/nexus.rs @@ -448,15 +448,30 @@ impl From<&NexusSpec> for DestroyNexus { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Default)] pub struct NexusStatusInfo { shutdown_failed: bool, + /// Whether the nexus was shut down again after the failed shutdown. + /// Kept in memory only: it is not (de)serialized and so starts out as `false`. + #[serde(skip)] + reshutdown: bool, } impl NexusStatusInfo { /// Create a new nexus status info. pub fn new(shutdown_failed: bool) -> NexusStatusInfo { - Self { shutdown_failed } + Self { + shutdown_failed, + reshutdown: false, + } } /// Check the nexus had a failed shutdown or not. pub fn shutdown_failed(&self) -> bool { self.shutdown_failed } + /// Flag the nexus as having been shut down again after a failed shutdown. + pub fn set_reshutdown(&mut self) { + self.reshutdown = true; + } + /// Check if the nexus has been shut down again after a failed shutdown. + pub fn reshutdown(&self) -> bool { + self.reshutdown + } }