feat: re-shutdown nexus when node is online
When a nexus shutdown fails and the node later comes back online,
attempt the shutdown again.

Signed-off-by: Tiago Castro <[email protected]>
tiagolobocastro committed Sep 16, 2024
1 parent 03684a0 commit 00c3fdc
Showing 5 changed files with 116 additions and 4 deletions.
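
Before the diffs, the gist of the retry guard as a minimal standalone sketch (illustrative names, not the crate's actual types — the real check lives in re_shutdown_nexus below): a nexus is eligible for another shutdown attempt only when its spec wants it shut down, the previous attempt failed, and no retry has succeeded yet.

// Sketch of the re-shutdown eligibility check (illustrative only).
fn should_re_shutdown(is_shutdown: bool, shutdown_failed: bool, already_reshutdown: bool) -> bool {
    // The spec wants the nexus down, the previous attempt failed,
    // and no retry has succeeded yet.
    is_shutdown && shutdown_failed && !already_reshutdown
}

fn main() {
    assert!(should_re_shutdown(true, true, false)); // retry once the node is back
    assert!(!should_re_shutdown(true, true, true)); // a retry already succeeded
    assert!(!should_re_shutdown(true, false, false)); // original shutdown was clean
}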
@@ -1,5 +1,6 @@
pub(super) mod capacity;
mod garbage_collector;
mod re_shutdown;

use crate::{
controller::{
@@ -59,7 +60,10 @@ impl NexusReconciler {
pub(crate) fn from(period: PollPeriods) -> Self {
NexusReconciler {
counter: PollTimer::from(period),
poll_targets: vec![Box::new(GarbageCollector::new())],
poll_targets: vec![
Box::new(GarbageCollector::new()),
Box::new(re_shutdown::ReShutdown::new()),
],
}
}
/// Return new `Self` with the default period
@@ -0,0 +1,50 @@
use crate::controller::{
reconciler::PollTriggerEvent,
resources::operations_helper::OperationSequenceGuard,
task_poller::{PollContext, PollEvent, PollResult, PollerState, TaskPoller},
};

/// Re-shutdown nexuses when a node comes back online with the nexus still intact.
#[derive(Debug)]
pub(super) struct ReShutdown {}
impl ReShutdown {
/// Return a new `Self`.
pub(super) fn new() -> Self {
Self {}
}
}

#[async_trait::async_trait]
impl TaskPoller for ReShutdown {
async fn poll(&mut self, context: &PollContext) -> PollResult {
// Fetch all nexuses which have failed to shut down properly.
for nexus in context.registry().specs().failed_shutdown_nexuses().await {
let Some(volume_id) = &nexus.immutable_ref().owner else {
continue;
};
let Ok(_volume) = context.specs().volume(volume_id).await else {
continue;
};

let Ok(mut nexus) = nexus.operation_guard() else {
continue;
};

nexus.re_shutdown_nexus(context.registry()).await;
}
PollResult::Ok(PollerState::Idle)
}

async fn poll_timer(&mut self, _context: &PollContext) -> bool {
false
}

async fn poll_event(&mut self, context: &PollContext) -> bool {
matches!(
context.event(),
PollEvent::Triggered(PollTriggerEvent::Start)
| PollEvent::Triggered(PollTriggerEvent::NodeStateChangeOnline)
| PollEvent::Triggered(PollTriggerEvent::NodeDrain)
)
}
}
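
Two details stand out above: poll_timer always returns false, so ReShutdown never runs on the periodic reconcile tick, and poll_event gates it to the startup, node-online, and node-drain triggers. A simplified, synchronous sketch of this event-driven poller pattern (illustrative types, not the crate's actual TaskPoller trait):

enum PollEvent {
    TimerTick,
    Start,
    NodeOnline,
}

trait TaskPoller {
    /// Perform the reconciliation work.
    fn poll(&mut self);
    /// Whether this poller also runs on plain timer ticks.
    fn poll_timer(&self) -> bool {
        true
    }
    /// Whether this poller runs for the given event.
    fn poll_event(&self, event: &PollEvent) -> bool {
        matches!(event, PollEvent::TimerTick)
    }
}

struct ReShutdown;

impl TaskPoller for ReShutdown {
    fn poll(&mut self) {
        println!("re-shutting down nexuses whose previous shutdown failed");
    }
    // Purely event-driven: never run on the timer.
    fn poll_timer(&self) -> bool {
        false
    }
    // Run on startup and when a node comes back online.
    fn poll_event(&self, event: &PollEvent) -> bool {
        matches!(event, PollEvent::Start | PollEvent::NodeOnline)
    }
}

fn main() {
    let mut poller = ReShutdown;
    assert!(!poller.poll_timer());
    for event in [PollEvent::TimerTick, PollEvent::Start, PollEvent::NodeOnline] {
        if poller.poll_event(&event) {
            poller.poll(); // fires for Start and NodeOnline, not TimerTick
        }
    }
}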
38 changes: 36 additions & 2 deletions control-plane/agents/src/bin/core/nexus/operations.rs
@@ -10,7 +10,7 @@ use crate::{
operations_helper::{
GuardedOperationsHelper, OnCreateFail, OperationSequenceGuard, SpecOperationsHelper,
},
OperationGuardArc, TraceSpan, UpdateInnerValue,
OperationGuardArc, TraceSpan, TraceStrLog, UpdateInnerValue,
},
scheduling::resources::HealthyChildItems,
wrapper::{GetterOps, NodeWrapper},
@@ -26,7 +26,8 @@ use stor_port::types::v0::{
transport::{
child::Child,
nexus::{CreateNexus, DestroyNexus, Nexus, ResizeNexus, ShareNexus, UnshareNexus},
AddNexusChild, FaultNexusChild, NexusOwners, NodeStatus, RemoveNexusChild, ShutdownNexus,
AddNexusChild, FaultNexusChild, NexusOwners, NexusStatus, NodeStatus, RemoveNexusChild,
ShutdownNexus,
},
};

@@ -564,4 +565,37 @@ impl OperationGuardArc<NexusSpec> {

Ok(())
}

/// If the previous nexus shutdown failed because the node was offline, and the node is now
/// available, then we can shut down the nexus properly.
pub(crate) async fn re_shutdown_nexus(&mut self, registry: &Registry) {
if !self.as_ref().is_shutdown()
|| !self.as_ref().status_info().shutdown_failed()
|| self.as_ref().status_info.reshutdown()
{
return;
}

let Ok(nexus_state) = registry.nexus(self.uuid()).await else {
return;
};

if nexus_state.status == NexusStatus::Shutdown {
self.lock().status_info.set_reshutdown();
return;
}

let Ok(node) = registry.node_wrapper(&self.as_ref().node).await else {
return;
};

if node
.shutdown_nexus(&ShutdownNexus::new(self.uuid().clone(), false))
.await
.is_ok()
{
self.info("Successfully re-shutdown nexus");
self.lock().status_info.set_reshutdown();
}
}
}
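
Note the ordering in re_shutdown_nexus: the live nexus state is consulted first, and if the data plane already reports the nexus as Shutdown the flag is recorded without issuing a new request; otherwise the shutdown is re-sent and the flag is only set on success, so subsequent NodeStateChangeOnline events keep retrying until an attempt succeeds.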
13 changes: 13 additions & 0 deletions control-plane/agents/src/bin/core/volume/specs.rs
@@ -647,6 +647,19 @@ impl ResourceSpecsLocked {
.collect()
}

/// Get a list of resourced NexusSpecs which have failed to shut down.
pub(crate) async fn failed_shutdown_nexuses(&self) -> Vec<ResourceMutex<NexusSpec>> {
self.read()
.nexuses
.values()
.filter(|nexus| {
let nexus_spec = nexus.lock();
nexus_spec.is_shutdown() && nexus_spec.status_info().shutdown_failed()
})
.cloned()
.collect()
}
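
The filter above runs under the specs read-lock and returns cloned ResourceMutex handles; each candidate is then re-validated by the ReShutdown poller (owner volume present, operation guard acquired) before any re-shutdown is attempted.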

/// Get the resourced volume nexus target for the given volume.
pub(crate) fn volume_target_nexus_rsc(
&self,
13 changes: 12 additions & 1 deletion control-plane/stor-port/src/types/v0/store/nexus.rs
@@ -448,15 +448,26 @@ impl From<&NexusSpec> for DestroyNexus {
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Default)]
pub struct NexusStatusInfo {
shutdown_failed: bool,
#[serde(skip)]
reshutdown: bool,
}

impl NexusStatusInfo {
/// Create a new nexus status info.
pub fn new(shutdown_failed: bool) -> NexusStatusInfo {
Self { shutdown_failed }
Self {
shutdown_failed,
reshutdown: false,
}
}
/// Check whether the nexus had a failed shutdown.
pub fn shutdown_failed(&self) -> bool {
self.shutdown_failed
}
/// Flag the nexus as having been re-shutdown successfully.
pub fn set_reshutdown(&mut self) {
self.reshutdown = true;
}
/// Check whether the nexus has already been re-shutdown.
pub fn reshutdown(&self) -> bool {
self.reshutdown
}
}
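
One detail worth calling out: #[serde(skip)] keeps the reshutdown flag out of the persisted spec, so it resets to false across a control-plane restart and the idempotent check is simply re-evaluated. A minimal round-trip sketch, assuming the serde and serde_json crates:

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, Default)]
struct NexusStatusInfo {
    shutdown_failed: bool,
    #[serde(skip)]
    reshutdown: bool,
}

fn main() {
    let info = NexusStatusInfo { shutdown_failed: true, reshutdown: true };
    let json = serde_json::to_string(&info).unwrap();
    assert!(!json.contains("reshutdown")); // the flag is never persisted
    let back: NexusStatusInfo = serde_json::from_str(&json).unwrap();
    assert!(back.shutdown_failed);
    assert!(!back.reshutdown); // restored as the default (false)
}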
