From 58947b3e00ce7a93943fa275507542829e16c35e Mon Sep 17 00:00:00 2001 From: Abhilash Shetty Date: Thu, 25 May 2023 13:55:34 +0000 Subject: [PATCH] fix(nexus_cleanup): cleanup old nexuses post volume destroy Signed-off-by: Abhilash Shetty --- .../reconciler/nexus/garbage_collector.rs | 5 +- .../src/bin/core/tests/volume/switchover.rs | 163 +++++++++++++++++- .../agents/src/bin/core/volume/operations.rs | 4 +- 3 files changed, 163 insertions(+), 9 deletions(-) diff --git a/control-plane/agents/src/bin/core/controller/reconciler/nexus/garbage_collector.rs b/control-plane/agents/src/bin/core/controller/reconciler/nexus/garbage_collector.rs index a148e47c5..e90bf054d 100644 --- a/control-plane/agents/src/bin/core/controller/reconciler/nexus/garbage_collector.rs +++ b/control-plane/agents/src/bin/core/controller/reconciler/nexus/garbage_collector.rs @@ -33,10 +33,7 @@ impl TaskPoller for GarbageCollector { Ok(guard) => guard, Err(_) => continue, }; - // if the nexus is in shutdown state don't let the reconcilers act on it. - if !nexus.lock().is_shutdown() { - let _ = nexus.garbage_collect(context).await; - } + let _ = nexus.garbage_collect(context).await; } PollResult::Ok(PollerState::Idle) } diff --git a/control-plane/agents/src/bin/core/tests/volume/switchover.rs b/control-plane/agents/src/bin/core/tests/volume/switchover.rs index b8c996b21..6079ef833 100644 --- a/control-plane/agents/src/bin/core/tests/volume/switchover.rs +++ b/control-plane/agents/src/bin/core/tests/volume/switchover.rs @@ -2,22 +2,177 @@ use deployer_cluster::{Cluster, ClusterBuilder}; use grpc::operations::{ - pool::traits::PoolOperations, registry::traits::RegistryOperations, - replica::traits::ReplicaOperations, volume::traits::VolumeOperations, + nexus::traits::NexusOperations, pool::traits::PoolOperations, + registry::traits::RegistryOperations, replica::traits::ReplicaOperations, + volume::traits::VolumeOperations, }; use std::{collections::HashMap, time::Duration}; use stor_port::{ transport_api::{ReplyErrorKind, ResourceKind}, types::v0::{ + openapi::models::SpecStatus, store::nexus::NexusSpec, transport::{ - CreateVolume, DestroyShutdownTargets, Filter, GetSpecs, Nexus, PublishVolume, - RepublishVolume, VolumeShareProtocol, + CreateVolume, DestroyShutdownTargets, DestroyVolume, Filter, GetSpecs, Nexus, + PublishVolume, RepublishVolume, VolumeShareProtocol, }, }, }; use tokio::time::sleep; +// This test: Creates a three io engine cluster +// Creates volume with 2 replicas and publishes it to create nexus +// Republishes nexus to a new node +// Stops node housing a previous nexus +// Destroys volume +// Starts node which was stopped and expects old nexus cleanup. +#[tokio::test] +async fn old_nexus_delete_after_vol_destroy() { + let reconcile_period = Duration::from_secs(1); + let cluster = ClusterBuilder::builder() + .with_rest(true) + .with_agents(vec!["core"]) + .with_io_engines(3) + .with_tmpfs_pool(POOL_SIZE_BYTES) + .with_cache_period("1s") + .with_reconcile_period(reconcile_period, reconcile_period) + .build() + .await + .unwrap(); + + let vol_client = cluster.grpc_client().volume(); + let nexus_client = cluster.grpc_client().nexus(); + let rest_client = cluster.rest_v00(); + let spec_client = rest_client.specs_api(); + + let vol = vol_client + .create( + &CreateVolume { + uuid: "ec4e66fd-3b33-4439-b504-d49aba53da26".try_into().unwrap(), + size: 5242880, + replicas: 2, + ..Default::default() + }, + None, + ) + .await + .expect("unable to create volume"); + + let vol = vol_client + .publish( + &PublishVolume { + uuid: vol.uuid().clone(), + share: None, + target_node: Some(cluster.node(0)), + publish_context: HashMap::new(), + frontend_nodes: vec![cluster.node(1).to_string()], + }, + None, + ) + .await + .expect("failed to publish volume"); + + let target_node = vol.state().target.unwrap().node; + assert_eq!(target_node, cluster.node(0)); + + let vol = vol_client + .republish( + &RepublishVolume { + uuid: vol.uuid().clone(), + target_node: Some(cluster.node(1)), + share: VolumeShareProtocol::Nvmf, + reuse_existing: false, + frontend_node: cluster.node(1), + reuse_existing_fallback: false, + }, + None, + ) + .await + .expect("unable to republish volume"); + + let republished_node = vol.state().target.unwrap().node; + assert_ne!( + republished_node, target_node, + "expected nexus node to change post republish" + ); + + let nexuses = nexus_client + .get(Filter::None, None) + .await + .expect("Could not list nexuses"); + assert_eq!(nexuses.0.len(), 2, "expected two nexuses after republish"); + + cluster + .composer() + .stop(cluster.node(0).as_str()) + .await + .expect("failed to stop container"); + + let nexuses_after_pause = nexus_client + .get(Filter::None, None) + .await + .expect("could not list nexuses"); + sleep(Duration::from_secs(2)).await; + + assert_eq!( + nexuses_after_pause.0.len(), + 1, + "expected 1 nexus after stop" + ); + + let spec = spec_client + .get_specs() + .await + .expect("expected to retrieve specs"); + + let nexus_spec = spec.nexuses; + assert_eq!(nexus_spec.len(), 2, "spec should contain 2 nexuses"); + + vol_client + .destroy( + &DestroyVolume { + uuid: vol.uuid().clone(), + }, + None, + ) + .await + .expect("failed to delete volume"); + + let spec = spec_client + .get_specs() + .await + .expect("expected to retrieve specs"); + + let nexus_spec = spec.nexuses; + assert_eq!( + nexus_spec.len(), + 1, + "spec should contain one nexus after vol destroy with old target node unreachable" + ); + + let n = nexus_spec.get(0).unwrap(); + assert_eq!( + n.status, + SpecStatus::Deleting, + "failed to mark nexus spec as Deleting" + ); + + cluster + .composer() + .start(cluster.node(0).as_str()) + .await + .expect("failed to start container"); + sleep(Duration::from_secs(5)).await; + + let spec = spec_client + .get_specs() + .await + .expect("expected to retrieve specs"); + + let nexus_spec = spec.nexuses; + assert_eq!(nexus_spec.len(), 0, "spec should not contain any nexus"); +} + #[tokio::test] async fn lazy_delete_shutdown_targets() { const POOL_SIZE_BYTES: u64 = 128 * 1024 * 1024; diff --git a/control-plane/agents/src/bin/core/volume/operations.rs b/control-plane/agents/src/bin/core/volume/operations.rs index 832e41d2f..e01ce2c28 100644 --- a/control-plane/agents/src/bin/core/volume/operations.rs +++ b/control-plane/agents/src/bin/core/volume/operations.rs @@ -151,7 +151,9 @@ impl ResourceLifecycle for OperationGuardArc { let nexus = nexus_arc.lock().deref().clone(); match nexus_arc.operation_guard_wait().await { Ok(mut guard) => { - let destroy = DestroyNexus::from(&nexus).with_disown(&request.uuid); + let destroy = DestroyNexus::from(&nexus) + .with_disown(&request.uuid) + .with_lazy(true); if let Err(error) = guard.destroy(registry, &destroy).await { nexus.warn_span(|| { tracing::warn!(