Skip to content

Commit

Permalink
chore(bors): merge pull request #639
Browse files Browse the repository at this point in the history
639: fix(nexus/replica): check with replica owner before destroying it r=tiagolobocastro a=tiagolobocastro

A user hit a very weird situation where there were 2 created nexuses containing the same replica. How can this happen? A potential situation is fixed where we now collect volume state AFTER we get the volume guard, though it's a very tight race, so I suspect something else might still be at play here.

Regardless of how this can happen, we now plug the hole by ensuring we always check with the volume replica removal logic before attempting to disown and destroy a replica.

Co-authored-by: Tiago Castro <[email protected]>
  • Loading branch information
mayastor-bors and tiagolobocastro committed Jul 17, 2023
2 parents c660b0e + b2ce7c0 commit f1f5a5b
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 64 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/pr-submodule-branch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: Submodule Branch Check
on:
pull_request:
types: ['opened', 'edited', 'reopened', 'synchronize']
push:
branches:
- 'release/**'
- staging
jobs:
submodule-branch:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Check root submodules branch
run: echo "Compat pass"

Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,24 @@ async fn hot_spare_reconcile(
volume_spec: &Arc<Mutex<VolumeSpec>>,
context: &PollContext,
) -> PollResult {
let uuid = volume_spec.lock().uuid.clone();
let volume_state = context.registry().get_volume_state(&uuid).await?;
let _guard = match volume_spec.operation_guard(OperationMode::ReconcileStart) {
Ok(guard) => guard,
Err(_) => return PollResult::Ok(PollerState::Busy),
};
let mode = OperationMode::ReconcileStep;

if !volume_spec.lock().policy.self_heal {
let volume_spec_cln = volume_spec.lock().clone();
if !volume_spec_cln.policy.self_heal {
return PollResult::Ok(PollerState::Idle);
}
if !volume_spec.lock().status.created() {
if !volume_spec_cln.status.created() {
return PollResult::Ok(PollerState::Idle);
}

let volume_state = context
.registry()
.get_volume_spec_state(volume_spec_cln)
.await?;
let mode = OperationMode::ReconcileStep;

match volume_state.status {
VolumeStatus::Online => volume_replica_count_reconciler(volume_spec, context, mode).await,
VolumeStatus::Unknown | VolumeStatus::Degraded => {
Expand Down
46 changes: 16 additions & 30 deletions control-plane/agents/core/src/nexus/specs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::core::{
};
use common::errors::{NexusNotFound, SvcError};
use common_lib::{
mbus_api::{ErrorChain, ResourceKind},
mbus_api::ResourceKind,
types::v0::{
message_bus::{
AddNexusChild, AddNexusReplica, Child, ChildUri, CreateNexus, DestroyNexus, Nexus,
Expand All @@ -16,7 +16,7 @@ use common_lib::{
nexus::{NexusOperation, NexusSpec},
nexus_child::NexusChild,
replica::ReplicaSpec,
OperationMode, SpecStatus, SpecTransaction, TraceSpan,
OperationMode, SpecStatus, SpecTransaction,
},
},
};
Expand Down Expand Up @@ -433,37 +433,23 @@ impl ResourceSpecsLocked {
.ok_or(SvcError::ReplicaNotFound {
replica_id: replica.uuid().clone(),
})?;
let pool_id = replica_spec.lock().pool.clone();
match Self::get_pool_node(registry, pool_id).await {
Some(node) => {
if let Err(error) = self
.disown_and_destroy_replica(registry, &node, replica.uuid())
.await
{
nexus_spec.lock().clone().error_span(|| {
tracing::error!(
replica.uuid = %replica.uuid(),
error = %error.full_string(),
"Failed to disown and destroy the replica"
)
});
}
}
None => {
// The replica node was not found (perhaps because it is offline).
// The replica can't be destroyed because the node isn't there.
// Instead, disown the replica from the volume and let the garbage
// collector destroy it later.
nexus_spec.lock().clone().warn_span(|| {
tracing::warn!(
replica.uuid = %replica.uuid(),
"Failed to find the node where the replica is hosted"
if !replica_spec.lock().owners.owned_by_a_nexus() {
let owner_volume = {
let replica_spec = replica_spec.lock();
replica_spec.owners.volume().cloned()
};
if let Some(owner) = owner_volume {
if let Some(volume) = self.get_locked_volume(&owner) {
self.remove_unused_volume_replica(
registry,
&volume,
replica.uuid(),
mode,
)
});
let _ = self.disown_volume_replica(registry, &replica_spec).await;
.await?;
}
}
}

Ok(())
}
result => result,
Expand Down
11 changes: 11 additions & 0 deletions control-plane/agents/core/src/volume/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@ impl Registry {
.await
}

/// Get the volume state for the specified volume spec.
pub(crate) async fn get_volume_spec_state(
&self,
volume_spec: VolumeSpec,
) -> Result<VolumeState, SvcError> {
let replica_specs = self.specs().get_cloned_volume_replicas(&volume_spec.uuid);

self.get_volume_state_with_replicas(&volume_spec, &replica_specs)
.await
}

/// Get the volume state for the specified volume
#[tracing::instrument(level = "info", skip(self, volume_spec, replicas))]
pub(crate) async fn get_volume_state_with_replicas(
Expand Down
33 changes: 13 additions & 20 deletions control-plane/agents/core/src/volume/specs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,8 @@ impl ResourceSpecsLocked {
{
Ok(_) => Ok(()),
Err(error) if !request.force() => Err(error),
Err(error @ SvcError::Conflict {}) => Err(error),
Err(error @ SvcError::StoreSave { .. }) => Err(error),
Err(error) => {
let node_online = match registry.get_node_wrapper(&nexus_clone.node).await {
Ok(node) => {
Expand Down Expand Up @@ -1053,6 +1055,17 @@ impl ResourceSpecsLocked {
.await
{
Ok(_) => Ok(()),
Err(SvcError::GrpcRequestError {
source,
request,
resource,
}) if source.code() == tonic::Code::DeadlineExceeded => {
Err(SvcError::GrpcRequestError {
source,
request,
resource,
})
}
Err(error) => {
if let Some(replica) = self.get_replica(&replica.uuid) {
let mut replica = replica.lock();
Expand Down Expand Up @@ -1333,26 +1346,6 @@ impl ResourceSpecsLocked {
.await
}

/// Disown and destroy the replica from its volume
pub(crate) async fn disown_and_destroy_replica(
&self,
registry: &Registry,
node: &NodeId,
replica_uuid: &ReplicaId,
) -> Result<(), SvcError> {
if let Some(replica) = self.get_replica(replica_uuid) {
// disown it from the volume first, so at the very least it can be garbage collected
// at a later point if the node is not accessible
self.disown_volume_replica(registry, &replica).await?;
self.destroy_volume_replica(registry, Some(node), &replica)
.await
} else {
Err(SvcError::ReplicaNotFound {
replica_id: replica_uuid.to_owned(),
})
}
}

/// Remove volume by its `id`
pub(super) fn remove_volume(&self, id: &VolumeId) {
let mut specs = self.write();
Expand Down
143 changes: 135 additions & 8 deletions control-plane/agents/core/src/volume/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ use common_lib::{
ReplicaId, ReplicaOwners, VolumeId,
},
openapi::{models, models::NodeStatus, tower::client::Error},
store::{definitions::StorableObject, volume::VolumeSpec},
store::{
definitions::StorableObject, nexus::ReplicaUri, nexus_child::NexusChild,
volume::VolumeSpec,
},
},
};
use std::{
Expand Down Expand Up @@ -903,26 +906,38 @@ async fn wait_till_volume(volume: &VolumeId, replicas: usize) {
}
}

/// Wait for a volume to reach the provided status
async fn wait_till_volume_status(cluster: &Cluster, volume: &Uuid, status: models::VolumeStatus) {
let timeout = Duration::from_secs(RECONCILE_TIMEOUT_SECS);
/// Wait for a volume to reach the provided status with timeout.
async fn wait_till_volume_status_timeout(
cluster: &Cluster,
volume: &Uuid,
status: models::VolumeStatus,
timeout: Duration,
) -> Result<(), String> {
let start = std::time::Instant::now();
loop {
let volume = cluster.rest_v00().volumes_api().get_volume(volume).await;
if volume.as_ref().unwrap().state.status == status {
return;
return Ok(());
}

if std::time::Instant::now() > (start + timeout) {
panic!(
"Timeout waiting for the volume to reach the specified status ('{:?}'), current: '{:?}'",
status, volume
return Err(
format!("Timeout waiting for the volume to reach the specified status ('{:?}'), current: '{:?}'",
status, volume)
);
}
tokio::time::sleep(Duration::from_millis(500)).await;
}
}

/// Wait for a volume to reach the provided status.
async fn wait_till_volume_status(cluster: &Cluster, volume: &Uuid, status: models::VolumeStatus) {
let timeout = Duration::from_secs(RECONCILE_TIMEOUT_SECS);
wait_till_volume_status_timeout(cluster, volume, status, timeout)
.await
.unwrap();
}

/// Either fault the local replica, the remote, or set the nexus as having an unclean shutdown
#[derive(Debug)]
enum FaultTest {
Expand Down Expand Up @@ -1499,3 +1514,115 @@ async fn smoke_test() {
assert!(GetNexuses::default().request().await.unwrap().0.is_empty());
assert!(GetReplicas::default().request().await.unwrap().0.is_empty());
}

/// When a second nexus with the same child is created for some reason, ensure that removing
/// a replica doesn't cause the replica to be disowned from the volume and destroyed.
/// This is something that shouldn't happen to begin with but this adds a safety net just in case.
#[tokio::test]
async fn duplicate_nexus_missing_children() {
let reconcile_period = Duration::from_millis(100);
let cluster = ClusterBuilder::builder()
.with_rest(true)
.with_agents(vec!["core"])
.with_mayastors(2)
.with_pool(1, "malloc:///d?size_mb=100")
.with_cache_period("100ms")
.with_reconcile_period(reconcile_period, reconcile_period)
.build()
.await
.unwrap();
let nodes = GetNodes::default().request().await.unwrap();
tracing::info!("Nodes: {:?}", nodes);

let volume = CreateVolume {
uuid: "1e3cf927-80c2-47a8-adf0-95c486bdd7b7".try_into().unwrap(),
size: 5242880,
replicas: 1,
..Default::default()
}
.request()
.await
.unwrap();

let fake_volume = CreateVolume {
uuid: "2e3cf927-80c2-47a8-adf0-95c486bdd7b7".try_into().unwrap(),
size: 5242880,
replicas: 1,
..Default::default()
}
.request()
.await
.unwrap();

let volume = PublishVolume::new(volume.spec().uuid.clone(), Some(cluster.node(0)), None)
.request()
.await
.unwrap();

tracing::info!("Volume: {:?}", volume);
let volume_state = volume.state();
let nexus = volume_state.target.unwrap().clone();

let child = nexus.children.first().cloned().unwrap();
let replica_uri = ReplicaUri::new(
&ReplicaId::try_from(child.uri.uuid_str().unwrap()).unwrap(),
&child.uri,
);

let local = "malloc:///local?size_mb=12&uuid=4a7b0566-8ec6-49e0-a8b2-1d9a292cf59b".into();

let bad_nexus = CreateNexus {
node: cluster.node(1),
uuid: NexusId::try_from("f086f12c-1728-449e-be32-9415051090d6").unwrap(),
size: 5242880,
children: vec![NexusChild::Replica(replica_uri), local],
managed: true,
// pretend this nexus is from another volume so it won't be deleted..
owner: Some(fake_volume.uuid().clone()),
..Default::default()
}
.request()
.await
.unwrap();

let nexuses = GetNexuses::default().request().await.unwrap().0;
tracing::info!("Nexuses: {:?}", nexuses);

let mut rpc_handle = cluster
.composer()
.grpc_handle(cluster.node(1).as_str())
.await
.unwrap();

let children_before_fault = volume_children(volume.uuid()).await;
tracing::info!("volume children: {:?}", children_before_fault);

let missing_child = child.uri.to_string();
rpc_handle
.mayastor
.remove_child_nexus(rpc::mayastor::RemoveChildNexusRequest {
uuid: bad_nexus.uuid.to_string(),
uri: missing_child.clone(),
})
.await
.unwrap();

tracing::debug!(
"Nexus: {:?}",
rpc_handle
.mayastor
.list_nexus(rpc::mayastor::Null {})
.await
.unwrap()
);

    // There is no easy way to check for a negative here; just wait for 2 garbage reconcilers.
wait_till_volume_status_timeout(
&cluster,
volume.uuid(),
models::VolumeStatus::Faulted,
reconcile_period * 10,
)
.await
.expect_err("Should not get faulted!");
}

0 comments on commit f1f5a5b

Please sign in to comment.