From ea6eb96c4bd62c4270fe2267266592be5b4f6a2a Mon Sep 17 00:00:00 2001
From: Tiago Castro
Date: Fri, 14 Jul 2023 20:21:07 +0100
Subject: [PATCH 1/3] test(nexus/replica): check if replica is destroyed after
 nexus removal

When a shared volume replica is removed from a nexus, it should not be
deleted!

Signed-off-by: Tiago Castro
---
 control-plane/agents/core/src/volume/tests.rs | 143 +++++++++++++++++-
 1 file changed, 135 insertions(+), 8 deletions(-)

diff --git a/control-plane/agents/core/src/volume/tests.rs b/control-plane/agents/core/src/volume/tests.rs
index f7574e891..a9e4b5fab 100644
--- a/control-plane/agents/core/src/volume/tests.rs
+++ b/control-plane/agents/core/src/volume/tests.rs
@@ -30,7 +30,10 @@ use common_lib::{
             ReplicaId, ReplicaOwners, VolumeId,
         },
         openapi::{models, models::NodeStatus, tower::client::Error},
-        store::{definitions::StorableObject, volume::VolumeSpec},
+        store::{
+            definitions::StorableObject, nexus::ReplicaUri, nexus_child::NexusChild,
+            volume::VolumeSpec,
+        },
     },
 };
 use std::{
@@ -903,26 +906,38 @@ async fn wait_till_volume(volume: &VolumeId, replicas: usize) {
     }
 }
 
-/// Wait for a volume to reach the provided status
-async fn wait_till_volume_status(cluster: &Cluster, volume: &Uuid, status: models::VolumeStatus) {
-    let timeout = Duration::from_secs(RECONCILE_TIMEOUT_SECS);
+/// Wait for a volume to reach the provided status, with a timeout.
+async fn wait_till_volume_status_timeout(
+    cluster: &Cluster,
+    volume: &Uuid,
+    status: models::VolumeStatus,
+    timeout: Duration,
+) -> Result<(), String> {
     let start = std::time::Instant::now();
     loop {
         let volume = cluster.rest_v00().volumes_api().get_volume(volume).await;
         if volume.as_ref().unwrap().state.status == status {
-            return;
+            return Ok(());
         }
 
         if std::time::Instant::now() > (start + timeout) {
-            panic!(
-                "Timeout waiting for the volume to reach the specified status ('{:?}'), current: '{:?}'",
-                status, volume
+            return Err(
+                format!("Timeout waiting for the volume to reach the specified status ('{:?}'), current: '{:?}'",
+                status, volume)
             );
         }
         tokio::time::sleep(Duration::from_millis(500)).await;
     }
 }
 
+/// Wait for a volume to reach the provided status.
+async fn wait_till_volume_status(cluster: &Cluster, volume: &Uuid, status: models::VolumeStatus) {
+    let timeout = Duration::from_secs(RECONCILE_TIMEOUT_SECS);
+    wait_till_volume_status_timeout(cluster, volume, status, timeout)
+        .await
+        .unwrap();
+}
+
 /// Either fault the local replica, the remote, or set the nexus as having an unclean shutdown
 #[derive(Debug)]
 enum FaultTest {
@@ -1499,3 +1514,115 @@ async fn smoke_test() {
     assert!(GetNexuses::default().request().await.unwrap().0.is_empty());
     assert!(GetReplicas::default().request().await.unwrap().0.is_empty());
 }
+
+/// When a second nexus with the same child is created for some reason, ensure that removing
+/// a replica doesn't cause the replica to be disowned from the volume and destroyed.
+/// This is something that shouldn't happen to begin with, but this adds a safety net just in case.
+#[tokio::test]
+async fn duplicate_nexus_missing_children() {
+    let reconcile_period = Duration::from_millis(100);
+    let cluster = ClusterBuilder::builder()
+        .with_rest(true)
+        .with_agents(vec!["core"])
+        .with_mayastors(2)
+        .with_pool(1, "malloc:///d?size_mb=100")
+        .with_cache_period("100ms")
+        .with_reconcile_period(reconcile_period, reconcile_period)
+        .build()
+        .await
+        .unwrap();
+    let nodes = GetNodes::default().request().await.unwrap();
+    tracing::info!("Nodes: {:?}", nodes);
+
+    let volume = CreateVolume {
+        uuid: "1e3cf927-80c2-47a8-adf0-95c486bdd7b7".try_into().unwrap(),
+        size: 5242880,
+        replicas: 1,
+        ..Default::default()
+    }
+    .request()
+    .await
+    .unwrap();
+
+    let fake_volume = CreateVolume {
+        uuid: "2e3cf927-80c2-47a8-adf0-95c486bdd7b7".try_into().unwrap(),
+        size: 5242880,
+        replicas: 1,
+        ..Default::default()
+    }
+    .request()
+    .await
+    .unwrap();
+
+    let volume = PublishVolume::new(volume.spec().uuid.clone(), Some(cluster.node(0)), None)
+        .request()
+        .await
+        .unwrap();
+
+    tracing::info!("Volume: {:?}", volume);
+    let volume_state = volume.state();
+    let nexus = volume_state.target.unwrap().clone();
+
+    let child = nexus.children.first().cloned().unwrap();
+    let replica_uri = ReplicaUri::new(
+        &ReplicaId::try_from(child.uri.uuid_str().unwrap()).unwrap(),
+        &child.uri,
+    );
+
+    let local = "malloc:///local?size_mb=12&uuid=4a7b0566-8ec6-49e0-a8b2-1d9a292cf59b".into();
+
+    let bad_nexus = CreateNexus {
+        node: cluster.node(1),
+        uuid: NexusId::try_from("f086f12c-1728-449e-be32-9415051090d6").unwrap(),
+        size: 5242880,
+        children: vec![NexusChild::Replica(replica_uri), local],
+        managed: true,
+        // pretend this nexus is from another volume so it won't be deleted.
+        owner: Some(fake_volume.uuid().clone()),
+        ..Default::default()
+    }
+    .request()
+    .await
+    .unwrap();
+
+    let nexuses = GetNexuses::default().request().await.unwrap().0;
+    tracing::info!("Nexuses: {:?}", nexuses);
+
+    let mut rpc_handle = cluster
+        .composer()
+        .grpc_handle(cluster.node(1).as_str())
+        .await
+        .unwrap();
+
+    let children_before_fault = volume_children(volume.uuid()).await;
+    tracing::info!("volume children: {:?}", children_before_fault);
+
+    let missing_child = child.uri.to_string();
+    rpc_handle
+        .mayastor
+        .remove_child_nexus(rpc::mayastor::RemoveChildNexusRequest {
+            uuid: bad_nexus.uuid.to_string(),
+            uri: missing_child.clone(),
+        })
+        .await
+        .unwrap();
+
+    tracing::debug!(
+        "Nexus: {:?}",
+        rpc_handle
+            .mayastor
+            .list_nexus(rpc::mayastor::Null {})
+            .await
+            .unwrap()
+    );
+
+    // There is no easy way to check for a negative here, so just wait for 2 garbage reconciler runs.
+    wait_till_volume_status_timeout(
+        &cluster,
+        volume.uuid(),
+        models::VolumeStatus::Faulted,
+        reconcile_period * 10,
+    )
+    .await
+    .expect_err("Should not get faulted!");
+}

From e4edf5d9a3c7dd36fca03781822ce5c2b95b1104 Mon Sep 17 00:00:00 2001
From: Tiago Castro
Date: Fri, 14 Jul 2023 19:54:07 +0100
Subject: [PATCH 2/3] fix(nexus/replica): check with replica owner before
 destroying it

A user hit a very weird situation where there were 2 nexuses created
containing the same replica. How can this happen?
A potential cause is fixed: we now collect the volume state AFTER we
take the volume guard, though it's a very tight race, so I suspect
something else might still be at play here.
Regardless of how this can happen, we now plug the hole by ensuring we
always check with the volume replica removal logic before attempting
to disown and destroy a replica.
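
In outline, the fixed ordering and the new gating check look like this
(a minimal sketch only, reusing names from the diffs below; the
concrete types and helpers live in the agent crates and are assumed
here):

    // Take the operation guard BEFORE collecting any volume state, so
    // the state cannot go stale while we wait for exclusive access.
    let _guard = match volume_spec.operation_guard(OperationMode::ReconcileStart) {
        Ok(guard) => guard,
        Err(_) => return PollResult::Ok(PollerState::Busy),
    };
    let volume_spec_cln = volume_spec.lock().clone();
    let volume_state = context
        .registry()
        .get_volume_spec_state(volume_spec_cln)
        .await?;

    // When a nexus child is removed, never destroy the backing replica
    // directly: only when no nexus still owns the replica do we defer
    // to the volume's replica removal logic, which decides whether the
    // replica is truly unused before disowning and destroying it.
    if !replica_spec.lock().owners.owned_by_a_nexus() {
        if let Some(owner) = replica_spec.lock().owners.volume().cloned() {
            if let Some(volume) = self.get_locked_volume(&owner) {
                self.remove_unused_volume_replica(registry, &volume, replica.uuid(), mode)
                    .await?;
            }
        }
    }
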
Signed-off-by: Tiago Castro
---
 .../src/core/reconciler/volume/hot_spare.rs   | 15 +++---
 control-plane/agents/core/src/nexus/specs.rs  | 46 +++++++------------
 .../agents/core/src/volume/registry.rs        | 11 +++++
 control-plane/agents/core/src/volume/specs.rs | 33 ++++++-------
 4 files changed, 49 insertions(+), 56 deletions(-)

diff --git a/control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs b/control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs
index bceb67b11..9a35dd932 100644
--- a/control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs
+++ b/control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs
@@ -48,21 +48,24 @@ async fn hot_spare_reconcile(
     volume_spec: &Arc<Mutex<VolumeSpec>>,
     context: &PollContext,
 ) -> PollResult {
-    let uuid = volume_spec.lock().uuid.clone();
-    let volume_state = context.registry().get_volume_state(&uuid).await?;
     let _guard = match volume_spec.operation_guard(OperationMode::ReconcileStart) {
         Ok(guard) => guard,
         Err(_) => return PollResult::Ok(PollerState::Busy),
     };
 
-    let mode = OperationMode::ReconcileStep;
-
-    if !volume_spec.lock().policy.self_heal {
+    let volume_spec_cln = volume_spec.lock().clone();
+    if !volume_spec_cln.policy.self_heal {
         return PollResult::Ok(PollerState::Idle);
     }
-    if !volume_spec.lock().status.created() {
+    if !volume_spec_cln.status.created() {
         return PollResult::Ok(PollerState::Idle);
     }
+    let volume_state = context
+        .registry()
+        .get_volume_spec_state(volume_spec_cln)
+        .await?;
+    let mode = OperationMode::ReconcileStep;
+
     match volume_state.status {
         VolumeStatus::Online => volume_replica_count_reconciler(volume_spec, context, mode).await,
         VolumeStatus::Unknown | VolumeStatus::Degraded => {
diff --git a/control-plane/agents/core/src/nexus/specs.rs b/control-plane/agents/core/src/nexus/specs.rs
index 006a4bb20..01ed35418 100644
--- a/control-plane/agents/core/src/nexus/specs.rs
+++ b/control-plane/agents/core/src/nexus/specs.rs
@@ -5,7 +5,7 @@ use crate::core::{
 };
 use common::errors::{NexusNotFound, SvcError};
 use common_lib::{
-    mbus_api::{ErrorChain, ResourceKind},
+    mbus_api::ResourceKind,
     types::v0::{
         message_bus::{
             AddNexusChild, AddNexusReplica, Child, ChildUri, CreateNexus, DestroyNexus, Nexus,
@@ -16,7 +16,7 @@ use common_lib::{
             nexus::{NexusOperation, NexusSpec},
             nexus_child::NexusChild,
             replica::ReplicaSpec,
-            OperationMode, SpecStatus, SpecTransaction, TraceSpan,
+            OperationMode, SpecStatus, SpecTransaction,
         },
     },
 };
@@ -433,37 +433,23 @@ impl ResourceSpecsLocked {
                     .ok_or(SvcError::ReplicaNotFound {
                         replica_id: replica.uuid().clone(),
                     })?;
-                let pool_id = replica_spec.lock().pool.clone();
-                match Self::get_pool_node(registry, pool_id).await {
-                    Some(node) => {
-                        if let Err(error) = self
-                            .disown_and_destroy_replica(registry, &node, replica.uuid())
-                            .await
-                        {
-                            nexus_spec.lock().clone().error_span(|| {
-                                tracing::error!(
-                                    replica.uuid = %replica.uuid(),
-                                    error = %error.full_string(),
-                                    "Failed to disown and destroy the replica"
-                                )
-                            });
-                        }
-                    }
-                    None => {
-                        // The replica node was not found (perhaps because it is offline).
-                        // The replica can't be destroyed because the node isn't there.
-                        // Instead, disown the replica from the volume and let the garbage
-                        // collector destroy it later.
-                        nexus_spec.lock().clone().warn_span(|| {
-                            tracing::warn!(
-                                replica.uuid = %replica.uuid(),
-                                "Failed to find the node where the replica is hosted"
+                if !replica_spec.lock().owners.owned_by_a_nexus() {
+                    let owner_volume = {
+                        let replica_spec = replica_spec.lock();
+                        replica_spec.owners.volume().cloned()
+                    };
+                    if let Some(owner) = owner_volume {
+                        if let Some(volume) = self.get_locked_volume(&owner) {
+                            self.remove_unused_volume_replica(
+                                registry,
+                                &volume,
+                                replica.uuid(),
+                                mode,
                             )
-                        });
-                        let _ = self.disown_volume_replica(registry, &replica_spec).await;
+                            .await?;
+                        }
                     }
                 }
-                Ok(())
             }
             result => result,
         }
diff --git a/control-plane/agents/core/src/volume/registry.rs b/control-plane/agents/core/src/volume/registry.rs
index df8b5cc75..7946377e6 100644
--- a/control-plane/agents/core/src/volume/registry.rs
+++ b/control-plane/agents/core/src/volume/registry.rs
@@ -22,6 +22,17 @@ impl Registry {
             .await
     }
 
+    /// Get the volume state for the specified volume spec.
+    pub(crate) async fn get_volume_spec_state(
+        &self,
+        volume_spec: VolumeSpec,
+    ) -> Result<VolumeState, SvcError> {
+        let replica_specs = self.specs().get_cloned_volume_replicas(&volume_spec.uuid);
+
+        self.get_volume_state_with_replicas(&volume_spec, &replica_specs)
+            .await
+    }
+
     /// Get the volume state for the specified volume
     #[tracing::instrument(level = "info", skip(self, volume_spec, replicas))]
     pub(crate) async fn get_volume_state_with_replicas(
diff --git a/control-plane/agents/core/src/volume/specs.rs b/control-plane/agents/core/src/volume/specs.rs
index 9b7ad39af..a1d032a14 100644
--- a/control-plane/agents/core/src/volume/specs.rs
+++ b/control-plane/agents/core/src/volume/specs.rs
@@ -655,6 +655,8 @@ impl ResourceSpecsLocked {
         {
             Ok(_) => Ok(()),
             Err(error) if !request.force() => Err(error),
+            Err(error @ SvcError::Conflict {}) => Err(error),
+            Err(error @ SvcError::StoreSave { .. }) => Err(error),
             Err(error) => {
                 let node_online = match registry.get_node_wrapper(&nexus_clone.node).await {
                     Ok(node) => {
@@ -1053,6 +1055,17 @@ impl ResourceSpecsLocked {
             .await
         {
             Ok(_) => Ok(()),
+            Err(SvcError::GrpcRequestError {
+                source,
+                request,
+                resource,
+            }) if source.code() == tonic::Code::DeadlineExceeded => {
+                Err(SvcError::GrpcRequestError {
+                    source,
+                    request,
+                    resource,
+                })
+            }
             Err(error) => {
                 if let Some(replica) = self.get_replica(&replica.uuid) {
                     let mut replica = replica.lock();
@@ -1333,26 +1346,6 @@ impl ResourceSpecsLocked {
             .await
     }
 
-    /// Disown and destroy the replica from its volume
-    pub(crate) async fn disown_and_destroy_replica(
-        &self,
-        registry: &Registry,
-        node: &NodeId,
-        replica_uuid: &ReplicaId,
-    ) -> Result<(), SvcError> {
-        if let Some(replica) = self.get_replica(replica_uuid) {
-            // disown it from the volume first, so at the very least it can be garbage collected
-            // at a later point if the node is not accessible
-            self.disown_volume_replica(registry, &replica).await?;
-            self.destroy_volume_replica(registry, Some(node), &replica)
-                .await
-        } else {
-            Err(SvcError::ReplicaNotFound {
-                replica_id: replica_uuid.to_owned(),
-            })
-        }
-    }
-
     /// Remove volume by its `id`
     pub(super) fn remove_volume(&self, id: &VolumeId) {
         let mut specs = self.write();

From b2ce7c012ec8c1b71a1cb0ea51cb2fe7cbe26a57 Mon Sep 17 00:00:00 2001
From: Tiago Castro
Date: Mon, 17 Jul 2023 16:09:36 +0100
Subject: [PATCH 3/3] ci: add fake submodule check to please gha/bors

Signed-off-by: Tiago Castro
---
 .github/workflows/pr-submodule-branch.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 .github/workflows/pr-submodule-branch.yml

diff --git a/.github/workflows/pr-submodule-branch.yml b/.github/workflows/pr-submodule-branch.yml
new file mode 100644
index 000000000..373678190
--- /dev/null
+++ b/.github/workflows/pr-submodule-branch.yml
@@ -0,0 +1,16 @@
+name: Submodule Branch Check
+on:
+  pull_request:
+    types: ['opened', 'edited', 'reopened', 'synchronize']
+  push:
+    branches:
+      - 'release/**'
+      - staging
+
+jobs:
+  submodule-branch:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Check root submodules branch
+        run: echo "Compat pass"