Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: [MR-581] Add functionalities of creating and removing unverified checkpoint markers #657

Merged
merged 24 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0caa26e
write and remove unverified markers
ShuoWangNSL Jul 29, 2024
e788011
remove redundant usages of checkpoint_verified()
ShuoWangNSL Jul 29, 2024
b46c8ea
fix checkpoint_verified() in tests, do_list, do_import_state state tools
ShuoWangNSL Jul 29, 2024
3ea61cb
test manifest computation
ShuoWangNSL Jul 31, 2024
cabc21a
Apply suggestions from code review
ShuoWangNSL Aug 6, 2024
d11889e
fix debug_assertion
ShuoWangNSL Aug 2, 2024
d55d528
apply reviews
ShuoWangNSL Aug 6, 2024
2160ee9
Merge branch 'master' into shuo/unverified_marker
ShuoWangNSL Aug 6, 2024
95fba0c
control debug_assertion in manifest compuation tests
ShuoWangNSL Aug 6, 2024
540a213
Merge branch 'master' into shuo/unverified_marker
ShuoWangNSL Aug 7, 2024
18f6e9b
fix checkpoint() from new tests
ShuoWangNSL Aug 8, 2024
ffdd9df
compute_manifest ignores marker in tests and external tools but handl…
ShuoWangNSL Aug 8, 2024
57e362b
group functions into load_checkpoint_parallel_and_mark_verified
ShuoWangNSL Aug 8, 2024
6fd5e0d
reviews
ShuoWangNSL Aug 8, 2024
d8538a9
clippy
ShuoWangNSL Aug 8, 2024
8b5ae4c
Merge branch 'master' into shuo/unverified_marker
ShuoWangNSL Aug 8, 2024
7d603b1
Merge branch 'master' into shuo/unverified_marker
ShuoWangNSL Aug 13, 2024
0da293d
apply reviews
ShuoWangNSL Aug 13, 2024
64184b7
can_keep_latest_verified_checkpoint_after_removal_with_unverified_che…
ShuoWangNSL Aug 13, 2024
82ba93a
checkpoints_to_keep could possibly be unverified ones
ShuoWangNSL Aug 13, 2024
d38d125
test handle_compute_manifest_request should panic
ShuoWangNSL Aug 14, 2024
3cebb09
Merge branch 'master' into shuo/unverified_marker
ShuoWangNSL Aug 14, 2024
b5f7bf5
refine comments
ShuoWangNSL Aug 14, 2024
a756087
Merge branch 'master' into shuo/unverified_marker
ShuoWangNSL Aug 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rs/replicated_state/src/page_map/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ impl Storage {
let mut base_overlays = Vec::<OverlayFile>::new();
let mut overlays = Vec::<OverlayFile>::new();
for path in overlay_paths.iter() {
let overlay = OverlayFile::load(path).unwrap();
let overlay = OverlayFile::load(path)?;
let start_page_index = overlay
.index_iter()
.next()
Expand Down
6 changes: 6 additions & 0 deletions rs/state_layout/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ pub enum LayoutError {

/// Trying to remove the latest checkpoint.
LatestCheckpoint(Height),

/// Checkpoint at the specified height is unverified.
CheckpointUnverified(Height),
}

impl fmt::Display for LayoutError {
Expand Down Expand Up @@ -57,6 +60,9 @@ impl fmt::Display for LayoutError {
"Trying to remove the latest checkpoint at height @{}",
height
),
Self::CheckpointUnverified(height) => {
write!(f, "Checkpoint @{} is not verified", height)
}
}
}
}
Expand Down
124 changes: 113 additions & 11 deletions rs/state_layout/src/state_layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,13 @@ impl TipHandler {
if path.extension() == Some(OsStr::new("pbuf")) {
// Do not copy protobufs.
CopyInstruction::Skip
} else if path == cp.unverified_checkpoint_marker() {
// With LSMT enabled, the unverified checkpoint marker should already be removed at this point.
debug_assert_eq!(lsmt_storage, FlagStatus::Disabled);
// With LSMT disabled, the unverified checkpoint marker is still present in the checkpoint at this point.
// We should not copy it back to the tip because it will be created later when promoting the tip as the next checkpoint.
// When we go for asynchronous checkpointing in the future, we should revisit this as the marker file will have a different lifespan.
CopyInstruction::Skip
} else if path.extension() == Some(OsStr::new("bin"))
&& lsmt_storage == FlagStatus::Disabled
&& !path.starts_with(cp.root.join(SNAPSHOTS_DIR))
Expand Down Expand Up @@ -514,7 +521,7 @@ impl StateLayout {
thread_pool: &mut Option<scoped_threadpool::Pool>,
) -> Result<(), LayoutError> {
for height in self.checkpoint_heights()? {
let path = self.checkpoint(height)?.raw_path().to_path_buf();
let path = self.checkpoint_verified(height)?.raw_path().to_path_buf();
sync_and_mark_files_readonly(&self.log, &path, &self.metrics, thread_pool.as_mut())
.map_err(|err| LayoutError::IoError {
path,
Expand Down Expand Up @@ -666,7 +673,7 @@ impl StateLayout {
message: "Could not sync checkpoints".to_string(),
io_err: err,
})?;
self.checkpoint(height)
self.checkpoint_in_verification(height)
}

pub fn clone_checkpoint(&self, from: Height, to: Height) -> Result<(), LayoutError> {
Expand All @@ -687,9 +694,9 @@ impl StateLayout {
Ok(())
}

/// Returns the layout of the checkpoint with the given height (if
/// there is one).
pub fn checkpoint(&self, height: Height) -> Result<CheckpointLayout<ReadOnly>, LayoutError> {
/// Returns the layout of the checkpoint with the given height.
/// If the checkpoint is not found, an error is returned.
fn checkpoint(&self, height: Height) -> Result<CheckpointLayout<ReadOnly>, LayoutError> {
let cp_name = Self::checkpoint_name(height);
let path = self.checkpoints().join(cp_name);
if !path.exists() {
Expand Down Expand Up @@ -721,17 +728,41 @@ impl StateLayout {
CheckpointLayout::new(path, height, self.clone())
}

/// Returns the untracked `CheckpointLayout` for the given height (if there is one).
pub fn checkpoint_untracked(
/// Returns the layout of a verified checkpoint with the given height.
/// If the checkpoint is not found or is not verified, an error is returned.
pub fn checkpoint_verified(
&self,
height: Height,
) -> Result<CheckpointLayout<ReadOnly>, LayoutError> {
let cp = self.checkpoint(height)?;
if !cp.is_checkpoint_verified() {
return Err(LayoutError::CheckpointUnverified(height));
};
Ok(cp)
}

/// Returns the layout of a checkpoint with the given height that is in the verification process.
/// If the checkpoint is not found, an error is returned.
///
/// Note that the unverified marker file may already be removed from the checkpoint by another verification process.
/// This method does not require that the marker file exists.
pub fn checkpoint_in_verification(
&self,
height: Height,
) -> Result<CheckpointLayout<ReadOnly>, LayoutError> {
self.checkpoint(height)
}

/// Returns if a checkpoint with the given height is verified or not.
/// If the checkpoint is not found, an error is returned.
pub fn checkpoint_verification_status(&self, height: Height) -> Result<bool, LayoutError> {
let cp_name = Self::checkpoint_name(height);
let path = self.checkpoints().join(cp_name);
if !path.exists() {
return Err(LayoutError::NotFound(height));
}
CheckpointLayout::new_untracked(path, height)
let cp = CheckpointLayout::<ReadOnly>::new_untracked(path, height)?;
ShuoWangNSL marked this conversation as resolved.
Show resolved Hide resolved
Ok(cp.is_checkpoint_verified())
}

fn increment_checkpoint_ref_counter(&self, height: Height) {
Expand Down Expand Up @@ -779,14 +810,24 @@ impl StateLayout {
}
}

/// Returns a sorted list of `Height`s for which a checkpoint is available.
/// Returns a sorted list of `Height`s for which a checkpoint is available and verified.
pub fn checkpoint_heights(&self) -> Result<Vec<Height>, LayoutError> {
let checkpoint_heights = self
.unfiltered_checkpoint_heights()?
.into_iter()
.filter(|h| self.checkpoint_verification_status(*h).unwrap_or(false))
.collect();

Ok(checkpoint_heights)
}

/// Returns a sorted list of `Height`s for which a checkpoint is available, regardless of verification status.
pub fn unfiltered_checkpoint_heights(&self) -> Result<Vec<Height>, LayoutError> {
let names = dir_file_names(&self.checkpoints()).map_err(|err| LayoutError::IoError {
path: self.checkpoints(),
message: format!("Failed to get all checkpoints (err kind: {:?})", err.kind()),
io_err: err,
})?;

parse_and_sort_checkpoint_heights(&names[..])
}

Expand Down Expand Up @@ -902,7 +943,7 @@ impl StateLayout {
if heights.is_empty() {
error!(
self.log,
"Trying to remove non-existing checkpoint {}. The CheckpoinLayout was invalid",
"Trying to remove non-existing checkpoint {}. The CheckpointLayout was invalid",
height,
);
self.metrics
Expand Down Expand Up @@ -1495,6 +1536,67 @@ impl<Permissions: AccessPolicy> CheckpointLayout<Permissions> {
pub fn raw_path(&self) -> &Path {
&self.root
}

/// Returns if the checkpoint is marked as unverified or not.
pub fn is_checkpoint_verified(&self) -> bool {
!self.unverified_checkpoint_marker().exists()
}
}

impl<P> CheckpointLayout<P>
where
P: WritePolicy,
{
/// Creates the unverified checkpoint marker.
/// If the marker already exists, this function does nothing and returns `Ok(())`.
///
/// Only the checkpoint layout with write policy can create the unverified checkpoint marker,
/// e.g. state sync scratchpad and tip.
pub fn create_unverified_checkpoint_marker(&self) -> Result<(), LayoutError> {
let marker = self.unverified_checkpoint_marker();
if marker.exists() {
return Ok(());
}
open_for_write(&marker)?;
sync_path(&self.root).map_err(|err| LayoutError::IoError {
path: self.root.clone(),
message: "Failed to sync checkpoint directory for the creation of the unverified checkpoint marker".to_string(),
io_err: err,
})
}
}

impl CheckpointLayout<ReadOnly> {
schneiderstefan marked this conversation as resolved.
Show resolved Hide resolved
/// Removes the unverified checkpoint marker.
/// If the marker does not exist, this function does nothing and returns `Ok(())`.
///
/// A readonly checkpoint typically prevents modification to the files in the checkpoint.
/// However, the removal of the unverified checkpoint marker is allowed as
/// the marker is not part the checkpoint conceptually.
pub fn remove_unverified_checkpoint_marker(&self) -> Result<(), LayoutError> {
let marker = self.unverified_checkpoint_marker();
if !marker.exists() {
return Ok(());
}
match std::fs::remove_file(&marker) {
Err(err) if err.kind() != std::io::ErrorKind::NotFound => {
return Err(LayoutError::IoError {
path: marker.to_path_buf(),
message: "failed to remove file from disk".to_string(),
io_err: err,
});
}
_ => {}
}

// Sync the directory to make sure the marker is removed from disk.
// This is strict prerequisite for the manifest computation.
sync_path(&self.root).map_err(|err| LayoutError::IoError {
path: self.root.clone(),
message: "Failed to sync checkpoint directory for the creation of the unverified checkpoint marker".to_string(),
io_err: err,
})
}
}

pub struct PageMapLayout<Permissions: AccessPolicy> {
Expand Down
46 changes: 46 additions & 0 deletions rs/state_layout/src/state_layout/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,52 @@ fn test_last_removal_panics_in_debug() {
});
}

#[test]
ShuoWangNSL marked this conversation as resolved.
Show resolved Hide resolved
fn test_can_remove_unverified_marker_file_twice() {
with_test_replica_logger(|log| {
let tempdir = tmpdir("state_layout");
let root_path = tempdir.path().to_path_buf();
let metrics_registry = ic_metrics::MetricsRegistry::new();
let state_layout = StateLayout::try_new(log, root_path, &metrics_registry).unwrap();

let height = Height::new(1);
let state_sync_scratchpad = state_layout.state_sync_scratchpad(height).unwrap();
let scratchpad_layout =
CheckpointLayout::<RwPolicy<()>>::new_untracked(state_sync_scratchpad, height)
.expect("failed to create checkpoint layout");
// Create at least a file in the scratchpad layout. Otherwise, empty folders can be overridden without errors
// and calling "scratchpad_to_checkpoint" twice will not fail as expected.
File::create(scratchpad_layout.raw_path().join(SYSTEM_METADATA_FILE)).unwrap();

let tip_path = state_layout.tip_path();
let tip = CheckpointLayout::<RwPolicy<()>>::new_untracked(tip_path, height)
.expect("failed to create tip layout");
File::create(tip.raw_path().join(SYSTEM_METADATA_FILE)).unwrap();

// Create marker files in both the scratchpad and tip and try to promote them to a checkpoint.
scratchpad_layout
.create_unverified_checkpoint_marker()
.unwrap();
tip.create_unverified_checkpoint_marker().unwrap();

let checkpoint = state_layout
.scratchpad_to_checkpoint(scratchpad_layout, height, None)
.unwrap();
checkpoint.remove_unverified_checkpoint_marker().unwrap();

// The checkpoint already exists, therefore promoting the tip to checkpoint should fail.
// However, it can still access the checkpoint and try to remove the marker file again from its perspective.
let checkpoint_result = state_layout.scratchpad_to_checkpoint(tip, height, None);
assert!(checkpoint_result.is_err());

let res = state_layout
.checkpoint(height)
.unwrap()
.remove_unverified_checkpoint_marker();
assert!(res.is_ok());
});
}

#[test]
fn test_canister_id_from_path() {
assert_eq!(
Expand Down
3 changes: 2 additions & 1 deletion rs/state_machine_tests/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2356,11 +2356,12 @@ impl StateMachine {
CertificationScope::Full,
None,
);
self.state_manager.flush_tip_channel();

other_env.import_canister_state(
ShuoWangNSL marked this conversation as resolved.
Show resolved Hide resolved
self.state_manager
.state_layout()
.checkpoint(height)
.checkpoint_verified(height)
.unwrap()
.canister(&canister_id)
.unwrap()
Expand Down
17 changes: 17 additions & 0 deletions rs/state_manager/src/checkpoint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ pub(crate) fn make_checkpoint(
)?
};

cp.remove_unverified_checkpoint_marker()?;

ShuoWangNSL marked this conversation as resolved.
Show resolved Hide resolved
Ok((cp, state, has_downgrade))
}

Expand All @@ -159,6 +161,21 @@ pub fn load_checkpoint_parallel(
)
}

/// Calls [load_checkpoint_parallel] and removes the unverified checkpoint marker.
/// This combination is useful when marking a checkpoint as verified immediately after a successful loading.
pub fn load_checkpoint_parallel_and_mark_verified(
checkpoint_layout: &CheckpointLayout<ReadOnly>,
own_subnet_type: SubnetType,
metrics: &CheckpointMetrics,
fd_factory: Arc<dyn PageAllocatorFileDescriptor>,
) -> Result<ReplicatedState, CheckpointError> {
let state = load_checkpoint_parallel(checkpoint_layout, own_subnet_type, metrics, fd_factory)?;
checkpoint_layout
.remove_unverified_checkpoint_marker()
.map_err(CheckpointError::from)?;
Ok(state)
}

/// Loads the node state heighted with `height` using the specified
/// directory layout.
pub fn load_checkpoint(
Expand Down
Loading
Loading