Bootstrap PP from a SnapshotRepository snapshot #2288

Draft · wants to merge 2 commits into feat/snapshot-upload
17 changes: 17 additions & 0 deletions crates/core/src/task_center.rs
@@ -651,6 +651,23 @@ impl TaskCenter {
.spawn_blocking(move || tc.block_on(name, partition_id, future))
}

/// Spawns a synchronous function on a blocking thread, wrapped in the task-center scope.
pub fn spawn_blocking_fn_unmanaged<F, O>(
&self,
name: &'static str,
partition_id: Option<PartitionId>,
f: F,
) -> tokio::task::JoinHandle<O>
where
F: FnOnce() -> O + Send + 'static,
O: Send + 'static,
{
let tc = self.clone();
self.inner
.default_runtime_handle
.spawn_blocking(move || tc.run_in_scope_sync(name, partition_id, f))
}

/// Cancelling the child will not cancel the parent. Note that parent task will not
/// wait for children tasks. The parent task is allowed to finish before children.
#[track_caller]
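For orientation, a hedged usage sketch of the new helper (not part of this diff; the caller, task name, and file path are illustrative placeholders, and it assumes a configured task center plus anyhow for error handling). The closure runs on Tokio's blocking pool, wrapped in the task-center scope so the task name and partition id stay attached.

```rust
use restate_core::task_center;
use restate_types::identifiers::PartitionId;

// Hypothetical caller, for illustration only.
async fn read_manifest_blocking(partition_id: PartitionId) -> anyhow::Result<String> {
    let handle = task_center().spawn_blocking_fn_unmanaged(
        "read-manifest",
        Some(partition_id),
        // Plain synchronous work; it must not block on async code itself.
        || std::fs::read_to_string("/tmp/manifest.json"),
    );
    // The JoinHandle resolves with the closure's return value once it finishes.
    Ok(handle.await??)
}
```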
2 changes: 1 addition & 1 deletion crates/worker/Cargo.toml
@@ -72,7 +72,7 @@ tempfile = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tokio-util = { workspace = true }
tokio-util = { workspace = true, features = ["io-util"] }
tracing = { workspace = true }
tracing-opentelemetry = { workspace = true }
url = { workspace = true }
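The added "io-util" feature is what provides SyncIoBridge, used below in repository.rs. A minimal sketch of the adapter chain it enables, assuming a byte stream like the one object_store produces (the function name, bounds, and the futures/bytes crates are assumptions, not part of this diff):

```rust
use tokio_util::io::{StreamReader, SyncIoBridge};

// StreamReader turns a Stream of Bytes into an AsyncRead; SyncIoBridge then
// adapts that AsyncRead into a blocking std::io::Read that synchronous
// consumers such as tar::Archive can drive.
fn into_blocking_reader<S>(stream: S) -> impl std::io::Read
where
    S: futures::Stream<Item = std::io::Result<bytes::Bytes>> + Unpin,
{
    // SyncIoBridge::new captures the current Tokio runtime handle, so it must
    // be constructed on a runtime thread before the reader moves to a
    // blocking-pool task.
    SyncIoBridge::new(StreamReader::new(stream))
}
```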
13 changes: 5 additions & 8 deletions crates/worker/src/partition/mod.rs
@@ -78,7 +78,7 @@ mod cleaner;
pub mod invoker_storage_reader;
mod leadership;
pub mod shuffle;
mod snapshots;
pub mod snapshots;
mod state_machine;
pub mod types;

@@ -148,6 +148,7 @@ where
bifrost: Bifrost,
mut partition_store: PartitionStore,
configuration: Live<Configuration>,
snapshot_repository: SnapshotRepository,
) -> Result<PartitionProcessor<Codec, InvokerInputSender>, StorageError> {
let PartitionProcessorBuilder {
partition_id,
@@ -197,13 +198,9 @@ where
last_seen_leader_epoch,
);

let config = configuration.pinned();
let snapshot_producer = SnapshotProducer::create(
partition_store.clone(),
configuration,
SnapshotRepository::create(config.common.base_dir(), &config.worker.snapshots).await?,
)
.await?;
let snapshot_producer =
SnapshotProducer::create(partition_store.clone(), configuration, snapshot_repository)
.await?;

Ok(PartitionProcessor {
task_center,
98 changes: 86 additions & 12 deletions crates/worker/src/partition/snapshots/repository.rs
@@ -19,11 +19,12 @@ use aws_credential_types::provider::ProvideCredentials;
use object_store::aws::AmazonS3Builder;
use object_store::{ObjectStore, PutPayload};
use tempfile::NamedTempFile;
use tracing::{debug, trace_span, warn};
use tokio_util::io::{StreamReader, SyncIoBridge};
use tracing::{debug, trace, trace_span, warn};
use url::Url;

use restate_core::task_center;
use restate_partition_store::snapshots::PartitionSnapshotMetadata;
use restate_partition_store::snapshots::{LocalPartitionSnapshot, PartitionSnapshotMetadata};
use restate_types::config::SnapshotsOptions;
use restate_types::identifiers::PartitionId;

@@ -89,7 +90,7 @@ impl SnapshotRepository {
object_store,
destination,
prefix,
staging_path: base_dir.clone(),
staging_path: base_dir.clone().join("snapshot-staging"),
})
}

@@ -115,20 +116,16 @@ impl SnapshotRepository {
// reverse order. We inject an explicit sort key into the snapshot prefix to make sure that
// the latest snapshot is always first.
let inverted_sort_key = format!("{:016x}", u64::MAX - lsn.as_u64());
let key = format!(
"{partition_id}/{sk}/{snapshot_id}_{lsn}.tar",
sk = inverted_sort_key,
);

// The snapshot data / metadata key format is: [<base_prefix>/]<partition_id>/<sort_key>/<snapshot_id>_<lsn>.tar
// The snapshot data / metadata key format is: [<base_prefix>/]<partition_id>/<sort_key>_<lsn>_<snapshot_id>.tar
let snapshot_key = match self.prefix.as_str() {
"" | "/" => format!(
"{partition_id}/{sk}/{snapshot_id}_{lsn}.tar",
"{partition_id}/{sk}_{lsn}_{snapshot_id}.tar",
sk = inverted_sort_key,
lsn = metadata.min_applied_lsn,
),
prefix => format!(
"{trimmed_prefix}/{partition_id}/{sk}/{snapshot_id}_{lsn}.tar",
"{trimmed_prefix}/{partition_id}/{sk}_{lsn}_{snapshot_id}.tar",
trimmed_prefix = prefix.trim_start_matches('/').trim_end_matches('/'),
sk = inverted_sort_key,
),
@@ -160,18 +157,95 @@ impl SnapshotRepository {

let upload = self
.object_store
.put(&object_store::path::Path::from(snapshot_key), payload)
.put(
&object_store::path::Path::from(snapshot_key.clone()),
payload,
)
.await
.context("Failed to put snapshot in repository")?;

debug!(
%snapshot_id,
etag = upload.e_tag.unwrap_or_default(),
"Successfully published snapshot to repository as: {}",
key,
snapshot_key,
);
Ok(())
}

pub(crate) async fn find_latest(
&self,
partition_id: PartitionId,
) -> anyhow::Result<Option<LocalPartitionSnapshot>> {
let list_prefix = match self.prefix.as_str() {
"" | "/" => format!("{}/", partition_id),
prefix => format!("{}/{}/", prefix, partition_id),
};
let list_prefix = object_store::path::Path::from(list_prefix.as_str());

let list = self
.object_store
.list_with_delimiter(Some(&list_prefix))
.await?;

let latest = list.objects.first();

let Some(snapshot_entry) = latest else {
debug!(%partition_id, "No snapshots found in the snapshots repository");
return Ok(None);
};

let snapshot_object = self
.object_store
.get(&snapshot_entry.location)
.await
.context("Failed to get snapshot from repository")?;

// Construct the bridge in a Tokio context, before moving to the blocking pool.
let snapshot_reader = SyncIoBridge::new(StreamReader::new(snapshot_object.into_stream()));

let snapshot_name = snapshot_entry.location.filename().expect("has a name");
let snapshot_base_path = &self.staging_path.join(snapshot_name);
tokio::fs::create_dir_all(snapshot_base_path).await?;

let snapshot_dir = snapshot_base_path.clone();
trace!(%partition_id, "Unpacking snapshot {} to: {:?}", snapshot_entry.location, snapshot_dir);
task_center()
.spawn_blocking_fn_unmanaged("unpack-snapshot", Some(partition_id), move || {
let mut tarball = tar::Archive::new(snapshot_reader);
for file in tarball.entries()? {
let mut file = file?;
trace!("Unpacking snapshot file: {:?}", file.header().path()?);
file.unpack_in(&snapshot_dir)?;
}
Ok::<(), anyhow::Error>(())
})
.await??;
Comment on lines +213 to +223 (Contributor Author):

I initially tried to use spawn_blocking_unmanaged here but got the following error:

"Cannot start a runtime from within a runtime. This happens because a function (like block_on) attempted to block the current thread while the thread is being used to drive asynchronous tasks."

This is not a problem in the earlier use in this file - on line 136, we spawn a future which also uses the tar crate to bundle up a local directory on the upload path, and that works just fine. The only difference here appears to be the use of SyncIoBridge to make the AsyncRead byte stream of the download into a Read for tar's Archive to consume.

I was also able to get this to work using plain vanilla tokio::task::spawn_blocking. It seems like the TaskCenter implementation's use of block_on is part of the problem, but I'm not entirely sure what's going on:

self.inner
.default_runtime_handle
.block_on(self.run_in_scope(name, partition_id, future))
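
A standalone sketch (not part of this PR, and resting on the assumption that SyncIoBridge drives its reads with a nested block_on on the captured runtime handle) of the failure mode described above: a blocking-pool thread may call Handle::block_on once, but a second block_on issued from inside the future it is driving trips Tokio's runtime-within-runtime check.

```rust
fn main() {
    let rt = tokio::runtime::Runtime::new().unwrap();
    let handle = rt.handle().clone();
    rt.block_on(async move {
        let inner = handle.clone();
        let join = tokio::task::spawn_blocking(move || {
            // Allowed: this runs on a blocking-pool thread, not an async worker.
            handle.block_on(async move {
                // Panics: the thread is now driving an async task, so Tokio
                // rejects a second, nested block_on ("Cannot start a runtime
                // from within a runtime ...").
                inner.block_on(async {});
            });
        })
        .await;
        // The panic surfaces as a JoinError on the spawning side.
        assert!(join.is_err());
    });
}
```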


let metadata = tokio::fs::read(snapshot_base_path.join("metadata.json")).await?;
let mut metadata: PartitionSnapshotMetadata = serde_json::from_slice(metadata.as_slice())?;

// Patch the file paths in the snapshot metadata to point to the correct staging directory on the local node.
let snapshot_base_path = snapshot_base_path
.to_path_buf()
.into_os_string()
.into_string()
.map_err(|path| anyhow::anyhow!("Invalid string: {:?}", path))?
.trim_end_matches('/')
.to_owned();
metadata
.files
.iter_mut()
.for_each(|f| f.directory = snapshot_base_path.clone());
trace!(%partition_id, "Restoring from snapshot metadata: {:?}", metadata);

Ok(Some(LocalPartitionSnapshot {
base_dir: self.staging_path.clone(),
min_applied_lsn: metadata.min_applied_lsn,
db_comparator_name: metadata.db_comparator_name,
files: metadata.files,
}))
}
}

#[derive(Debug)]
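To make the key layout concrete, a small illustration of how the inverted sort key keeps the newest snapshot first in a lexicographically sorted listing (partition id 1 and the snapshot id below are hypothetical; only the format mirrors the code above):

```rust
fn inverted_sort_key(lsn: u64) -> String {
    // Higher LSNs map to lexicographically smaller keys.
    format!("{:016x}", u64::MAX - lsn)
}

fn main() {
    // Keys printed for descending LSNs come out in ascending lexicographic
    // order, which is the order an S3-style listing returns them in.
    for lsn in [1_000_000u64, 5_000, 100] {
        println!("1/{}_{}_snap-example.tar", inverted_sort_key(lsn), lsn);
    }
    // 1/fffffffffff0bdbf_1000000_snap-example.tar   <- newest, listed first
    // 1/ffffffffffffec77_5000_snap-example.tar
    // 1/ffffffffffffff9b_100_snap-example.tar
}
```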
68 changes: 59 additions & 9 deletions crates/worker/src/partition_processor_manager.rs
@@ -76,6 +76,7 @@ use crate::metric_definitions::PARTITION_LAST_PERSISTED_LOG_LSN;
use crate::metric_definitions::PARTITION_TIME_SINCE_LAST_RECORD;
use crate::metric_definitions::PARTITION_TIME_SINCE_LAST_STATUS_UPDATE;
use crate::partition::invoker_storage_reader::InvokerStorageReader;
use crate::partition::snapshots::SnapshotRepository;
use crate::partition::PartitionProcessorControlCommand;
use crate::PartitionProcessorBuilder;

@@ -1124,6 +1125,9 @@ impl SpawnPartitionProcessorTask {
invoker.handle(),
);

let snapshot_repository =
SnapshotRepository::create(config.common.base_dir(), &config.worker.snapshots).await?;

let invoker_name = Box::leak(Box::new(format!("invoker-{}", partition_id)));
let invoker_config = configuration.clone().map(|c| &c.worker.invoker);

@@ -1135,14 +1139,54 @@
{
let options = options.clone();
let key_range = key_range.clone();
let partition_store = partition_store_manager
.open_partition_store(
partition_id,
key_range,
OpenMode::CreateIfMissing,
&options.storage.rocksdb,
)
.await?;

let partition_store = if !partition_store_manager
.has_partition(pp_builder.partition_id)
.await
{
info!(
partition_id = %partition_id,
"Looking for store snapshot to bootstrap partition",
);
let snapshot = snapshot_repository.find_latest(partition_id).await?;
if let Some(snapshot) = snapshot {
info!(
partition_id = %partition_id,
"Found snapshot to bootstrap partition, restoring it",
);
partition_store_manager
.restore_partition_store_snapshot(
partition_id,
key_range.clone(),
snapshot,
&options.storage.rocksdb,
)
.await?
} else {
info!(
partition_id = %partition_id,
"No snapshot found to bootstrap partition, creating new store",
);
partition_store_manager
.open_partition_store(
partition_id,
key_range,
OpenMode::CreateIfMissing,
&options.storage.rocksdb,
)
.await?
}
} else {
partition_store_manager
.open_partition_store(
partition_id,
key_range,
OpenMode::OpenExisting,
&options.storage.rocksdb,
)
.await?
};

move || async move {
tc.spawn_child(
TaskKind::SystemService,
Expand All @@ -1152,7 +1196,13 @@ impl SpawnPartitionProcessorTask {
)?;

let err = pp_builder
.build::<ProtobufRawEntryCodec>(tc, bifrost, partition_store, configuration)
.build::<ProtobufRawEntryCodec>(
tc,
bifrost,
partition_store,
configuration,
snapshot_repository,
)
.await?
.run()
.await