chore(bors): merge pull request #862
862: Don't destroy replicas that are part of shutdown nexuses r=tiagolobocastro a=tiagolobocastro

    feat: re-shutdown nexus when node is online
    
    When a nexus shutdown fails and the node later comes back online,
    attempt the shutdown again.
    
    Signed-off-by: Tiago Castro <[email protected]>
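
    Below is a minimal, self-contained sketch of the retry rule this message describes: a
    nexus is only re-shut down if it was marked for shutdown, the previous shutdown attempt
    failed, and it has not already been re-shut down. The type and field names here are
    illustrative stand-ins, not the control-plane's actual API; the real implementation is
    the `re_shutdown_nexus` method added in the operations.rs hunk further below.

    // Illustrative stand-in for the nexus shutdown bookkeeping; the real types
    // live in the control-plane crate (see the operations.rs hunk below).
    struct NexusShutdownState {
        marked_for_shutdown: bool, // the nexus spec says it should be shut down
        shutdown_failed: bool,     // the previous shutdown attempt failed
        re_shutdown_done: bool,    // a retry has already succeeded
    }

    impl NexusShutdownState {
        /// Retry rule from the commit message: only retry when the nexus was meant
        /// to be shut down, the attempt failed, and no retry has happened yet.
        fn needs_re_shutdown(&self) -> bool {
            self.marked_for_shutdown && self.shutdown_failed && !self.re_shutdown_done
        }
    }

    fn main() {
        let nexus = NexusShutdownState {
            marked_for_shutdown: true,
            shutdown_failed: true,
            re_shutdown_done: false,
        };
        // Once the node hosting the nexus comes back online, the reconciler retries.
        assert!(nexus.needs_re_shutdown());
    }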

---

    fix: don't disown replica from unshutdown nexus
    
    When a nexus shutdown failed, we used to disown the replica from
    both the volume and the nexus.
    This is a problem if the nexus is still running, as the io-engine
    does not handle it gracefully, leading to pool lock issues.
    Instead, disown the replica from the volume only, not the nexus.
    
    Signed-off-by: Tiago Castro <[email protected]>
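
    A minimal sketch of the ownership change described above, under the assumption that a
    replica tracks one volume owner and a set of nexus owners (the types here are illustrative
    stand-ins, not the control-plane's `ReplicaOwners`): the old behaviour dropped every owner,
    while the fix drops only the volume owner so the still-running nexus keeps its claim until
    it is destroyed. The real call sites, `ReplicaOwners::new_disown_all()` versus
    `ReplicaOwners::new(Some(volume.uuid().clone()), vec![])`, appear in the garbage_collector
    diff below.

    use std::collections::HashSet;

    // Illustrative stand-in for a replica's owner bookkeeping; the real
    // ReplicaOwners type lives in stor_port (see the diff below).
    struct Owners {
        volume: Option<String>,
        nexuses: HashSet<String>,
    }

    impl Owners {
        // Old behaviour: drop every owner, even a nexus that may still be running.
        fn disown_all(&mut self) {
            self.volume = None;
            self.nexuses.clear();
        }

        // Fixed behaviour: drop only the volume owner; the un-shutdown nexus
        // keeps its claim until it is destroyed.
        fn disown_volume(&mut self, volume_id: &str) {
            if self.volume.as_deref() == Some(volume_id) {
                self.volume = None;
            }
        }
    }

    fn main() {
        let mut fixed = Owners {
            volume: Some("vol-1".to_string()),
            nexuses: HashSet::from(["nexus-1".to_string()]),
        };
        fixed.disown_volume("vol-1");
        // The nexus ownership stays intact, avoiding the io-engine pool lock issue.
        assert!(fixed.volume.is_none() && fixed.nexuses.contains("nexus-1"));

        let mut old = Owners {
            volume: Some("vol-1".to_string()),
            nexuses: HashSet::from(["nexus-1".to_string()]),
        };
        old.disown_all();
        // The old behaviour also dropped the nexus owner of a nexus that may still be running.
        assert!(old.volume.is_none() && old.nexuses.is_empty());
    }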


Co-authored-by: Tiago Castro <[email protected]>
mayastor-bors and tiagolobocastro committed Sep 18, 2024
2 parents c88625e + 00c3fdc commit bebdf8a
Showing 7 changed files with 332 additions and 65 deletions.
@@ -1,5 +1,6 @@
pub(super) mod capacity;
mod garbage_collector;
mod re_shutdown;

use crate::{
controller::{
@@ -59,7 +60,10 @@ impl NexusReconciler {
    pub(crate) fn from(period: PollPeriods) -> Self {
        NexusReconciler {
            counter: PollTimer::from(period),
            poll_targets: vec![Box::new(GarbageCollector::new())],
            poll_targets: vec![
                Box::new(GarbageCollector::new()),
                Box::new(re_shutdown::ReShutdown::new()),
            ],
        }
    }
    /// Return new `Self` with the default period
@@ -0,0 +1,50 @@
use crate::controller::{
    reconciler::PollTriggerEvent,
    resources::operations_helper::OperationSequenceGuard,
    task_poller::{PollContext, PollEvent, PollResult, PollerState, TaskPoller},
};

/// ReShutdown nexuses if node comes back online with the Nexus intact.
#[derive(Debug)]
pub(super) struct ReShutdown {}
impl ReShutdown {
    /// Return a new `Self`.
    pub(super) fn new() -> Self {
        Self {}
    }
}

#[async_trait::async_trait]
impl TaskPoller for ReShutdown {
    async fn poll(&mut self, context: &PollContext) -> PollResult {
        // Fetch all nexuses that are not properly shutdown
        for nexus in context.registry().specs().failed_shutdown_nexuses().await {
            let Some(volume_id) = &nexus.immutable_ref().owner else {
                continue;
            };
            let Ok(_volume) = context.specs().volume(volume_id).await else {
                continue;
            };

            let Ok(mut nexus) = nexus.operation_guard() else {
                continue;
            };

            nexus.re_shutdown_nexus(context.registry()).await;
        }
        PollResult::Ok(PollerState::Idle)
    }

    async fn poll_timer(&mut self, _context: &PollContext) -> bool {
        false
    }

    async fn poll_event(&mut self, context: &PollContext) -> bool {
        matches!(
            context.event(),
            PollEvent::Triggered(PollTriggerEvent::Start)
                | PollEvent::Triggered(PollTriggerEvent::NodeStateChangeOnline)
                | PollEvent::Triggered(PollTriggerEvent::NodeDrain)
        )
    }
}
@@ -1,22 +1,15 @@
use crate::controller::{
reconciler::{GarbageCollect, PollContext, TaskPoller},
reconciler::{poller::ReconcilerWorker, GarbageCollect, PollContext, TaskPoller},
resources::{
operations::{ResourceLifecycle, ResourceResize},
operations::{ResourceLifecycle, ResourceOwnerUpdate, ResourceResize},
operations_helper::{OperationSequenceGuard, SpecOperationsHelper},
OperationGuardArc, TraceSpan, TraceStrLog,
OperationGuardArc, ResourceUid, TraceSpan, TraceStrLog,
},
task_poller::{PollEvent, PollResult, PollTimer, PollTriggerEvent, PollerState},
};

use crate::controller::{
reconciler::poller::ReconcilerWorker,
resources::{operations::ResourceOwnerUpdate, ResourceMutex, ResourceUid},
};
use agents::errors::SvcError;
use stor_port::types::v0::{
store::{
nexus::NexusSpec, nexus_persistence::NexusInfo, replica::ReplicaSpec, volume::VolumeSpec,
},
store::{nexus_persistence::NexusInfo, replica::ReplicaSpec, volume::VolumeSpec},
transport::{DestroyVolume, NexusOwners, ReplicaOwners, ResizeReplica, VolumeStatus},
};
use tracing::Instrument;
@@ -345,62 +338,42 @@ async fn disown_non_reservable_replicas(
let mut results = vec![];

// Fetch all nexuses that are not properly shutdown
let shutdown_failed_nexuses: Vec<ResourceMutex<NexusSpec>> = context
let shutdown_failed_nexuses = context
.registry()
.specs()
.volume_failed_shutdown_nexuses(volume.uuid())
.await;

// Remove the local child from the shutdown pending nexuses as they are
// non reservable.
// Remove the local children from the volume as they are non-reservable.
for nexus in shutdown_failed_nexuses {
if target.uuid() == nexus.uuid() {
continue;
}

let children = nexus.lock().clone().children;
for child in children {
if let Some(replica_uri) = child.as_replica() {
if replica_uri.uri().is_local() {
if let Ok(mut replica) = context.specs().replica(replica_uri.uuid()).await {
if !replica.as_ref().dirty()
&& replica.as_ref().owners.owned_by(volume.uuid())
&& !replica
.as_ref()
.owners
.nexuses()
.iter()
.any(|p| p != nexus.uuid())
&& !is_replica_healthy(
context,
&mut nexus_info,
replica.as_ref(),
volume.as_ref(),
)
.await?
{
volume.warn_span(|| tracing::warn!(replica.uuid = %replica.as_ref().uuid, "Attempting to disown non reservable replica"));

// Since its a local replica of a shutting down nexus i.e. a failed
// shutdown nexus we don't want it
// to be a part of anything. Once this replica has no owner, the
// garbage collector will remove the nexus. Even if the clean up is
// delayed we since this replica is
// anyhow not a part of the volume, we should be safe.
match replica
.remove_owners(
context.registry(),
&ReplicaOwners::new_disown_all(),
true,
)
.await
{
Ok(_) => {
volume.info_span(|| tracing::info!(replica.uuid = %replica.as_ref().uuid, "Successfully disowned non reservable replica"));
}
Err(error) => {
volume.error_span(|| tracing::error!(replica.uuid = %replica.as_ref().uuid, "Failed to disown non reservable replica"));
results.push(Err(error));
}
}
}
}
let Some(replica_uri) = child.as_replica() else {
continue;
};
if !replica_uri.uri().is_local() {
continue;
}
let Ok(replica) = context.specs().replica(replica_uri.uuid()).await else {
continue;
};
if !replica.as_ref().dirty()
&& replica.as_ref().owners.owned_by(volume.uuid())
&& !replica
.as_ref()
.owners
.nexuses()
.iter()
.any(|p| p != nexus.uuid())
&& !is_replica_healthy(context, &mut nexus_info, replica.as_ref(), volume.as_ref())
.await?
{
if let Err(error) = disown_non_reservable_replica(volume, replica, context).await {
results.push(Err(error));
}
}
}
@@ -409,6 +382,36 @@ async fn disown_non_reservable_replicas(
GarbageCollector::squash_results(results)
}

async fn disown_non_reservable_replica(
    volume: &OperationGuardArc<VolumeSpec>,
    mut replica: OperationGuardArc<ReplicaSpec>,
    context: &PollContext,
) -> PollResult {
    volume.warn_span(|| tracing::warn!(replica.uuid = %replica.as_ref().uuid, "Attempting to disown non reservable replica"));

    // Since it's a local replica of a shutting down nexus, i.e. a failed
    // shutdown nexus, we don't want it to be a part of anything until such nexus is destroyed.
    // Even if the cleanup is delayed, since this replica is
    // anyhow not a part of the new volume nexus, we should be safe.
    match replica
        .remove_owners(
            context.registry(),
            &ReplicaOwners::new(Some(volume.uuid().clone()), vec![]),
            true,
        )
        .await
    {
        Ok(_) => {
            volume.info_span(|| tracing::info!(replica.uuid = %replica.as_ref().uuid, "Successfully disowned non reservable replica"));
            PollResult::Ok(PollerState::Idle)
        }
        Err(error) => {
            volume.error_span(|| tracing::error!(replica.uuid = %replica.as_ref().uuid, "Failed to disown non reservable replica"));
            PollResult::Err(error)
        }
    }
}

async fn is_replica_healthy(
context: &PollContext,
nexus_info: &mut Option<NexusInfo>,
38 changes: 36 additions & 2 deletions control-plane/agents/src/bin/core/nexus/operations.rs
@@ -10,7 +10,7 @@ use crate::{
operations_helper::{
GuardedOperationsHelper, OnCreateFail, OperationSequenceGuard, SpecOperationsHelper,
},
OperationGuardArc, TraceSpan, UpdateInnerValue,
OperationGuardArc, TraceSpan, TraceStrLog, UpdateInnerValue,
},
scheduling::resources::HealthyChildItems,
wrapper::{GetterOps, NodeWrapper},
@@ -26,7 +26,8 @@ use stor_port::types::v0::{
transport::{
child::Child,
nexus::{CreateNexus, DestroyNexus, Nexus, ResizeNexus, ShareNexus, UnshareNexus},
AddNexusChild, FaultNexusChild, NexusOwners, NodeStatus, RemoveNexusChild, ShutdownNexus,
AddNexusChild, FaultNexusChild, NexusOwners, NexusStatus, NodeStatus, RemoveNexusChild,
ShutdownNexus,
},
};

@@ -564,4 +565,37 @@ impl OperationGuardArc<NexusSpec> {

        Ok(())
    }

    /// In case the previous nexus shutdown failed because the node is offline, and if the node is
    /// now available, then we can shut down the nexus properly.
    pub(crate) async fn re_shutdown_nexus(&mut self, registry: &Registry) {
        if !self.as_ref().is_shutdown()
            || !self.as_ref().status_info().shutdown_failed()
            || self.as_ref().status_info.reshutdown()
        {
            return;
        }

        let Ok(nexus_state) = registry.nexus(self.uuid()).await else {
            return;
        };

        if nexus_state.status == NexusStatus::Shutdown {
            self.lock().status_info.set_reshutdown();
            return;
        }

        let Ok(node) = registry.node_wrapper(&self.as_ref().node).await else {
            return;
        };

        if node
            .shutdown_nexus(&ShutdownNexus::new(self.uuid().clone(), false))
            .await
            .is_ok()
        {
            self.info("Successfully re-shutdown nexus");
            self.lock().status_info.set_reshutdown();
        }
    }
}
