Skip to content

Commit a459a9a

Browse files
Fix and test checkpoint sync from genesis (#7689)
Fix a bug involving checkpoint sync from genesis reported by Sunnyside labs. Ensure that the store's `anchor` is initialised prior to storing the genesis state. In the case of checkpoint sync from genesis, the genesis state will be in the _hot DB_, so we need the hot DB metadata to be initialised in order to store it. I've extended the existing checkpoint sync tests to cover this case as well. There are some subtleties around what the `state_upper_limit` should be set to in this case. I've opted to just enable state reconstruction from the start in the test so it gets set to 0, which results in an end state more consistent with the other test cases (full state reconstruction). This is required because we can't meaningfully do any state reconstruction when the split slot is 0 (there is no range of frozen slots to reconstruct).
1 parent fcc602a commit a459a9a

File tree

3 files changed

+154
-99
lines changed

3 files changed

+154
-99
lines changed

beacon_node/beacon_chain/src/builder.rs

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -514,9 +514,26 @@ where
514514
"Storing split from weak subjectivity state"
515515
);
516516

517-
// Set the store's split point *before* storing genesis so that genesis is stored
518-
// immediately in the freezer DB.
517+
// Set the store's split point *before* storing genesis so that if the genesis state
518+
// is prior to the split slot, it will immediately be stored in the freezer DB.
519519
store.set_split(weak_subj_slot, weak_subj_state_root, weak_subj_block_root);
520+
521+
// It is also possible for the checkpoint state to be equal to the genesis state, in which
522+
// case it will be stored in the hot DB. In this case, we need to ensure the store's anchor
523+
// is initialised prior to storing the state, as the anchor is required for working out
524+
// hdiff storage strategies.
525+
let retain_historic_states = self.chain_config.reconstruct_historic_states;
526+
self.pending_io_batch.push(
527+
store
528+
.init_anchor_info(
529+
weak_subj_block.parent_root(),
530+
weak_subj_block.slot(),
531+
weak_subj_slot,
532+
retain_historic_states,
533+
)
534+
.map_err(|e| format!("Failed to initialize anchor info: {:?}", e))?,
535+
);
536+
520537
let (_, updated_builder) = self.set_genesis_state(genesis_state)?;
521538
self = updated_builder;
522539

@@ -541,20 +558,6 @@ where
541558
"Stored frozen block roots at skipped slots"
542559
);
543560

544-
// Write the anchor to memory before calling `put_state` otherwise hot hdiff can't store
545-
// states that do not align with the `start_slot` grid.
546-
let retain_historic_states = self.chain_config.reconstruct_historic_states;
547-
self.pending_io_batch.push(
548-
store
549-
.init_anchor_info(
550-
weak_subj_block.parent_root(),
551-
weak_subj_block.slot(),
552-
weak_subj_slot,
553-
retain_historic_states,
554-
)
555-
.map_err(|e| format!("Failed to initialize anchor info: {:?}", e))?,
556-
);
557-
558561
// Write the state, block and blobs non-atomically, it doesn't matter if they're forgotten
559562
// about on a crash restart.
560563
store

beacon_node/beacon_chain/tests/store_tests.rs

Lines changed: 129 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -2281,6 +2281,19 @@ async fn weak_subjectivity_sync_skips_at_genesis() {
22812281
weak_subjectivity_sync_test(slots, checkpoint_slot).await
22822282
}
22832283

2284+
// Checkpoint sync from the genesis state.
2285+
//
2286+
// This is a regression test for a bug we had involving the storage of the genesis state in the hot
2287+
// DB.
2288+
#[tokio::test]
2289+
async fn weak_subjectivity_sync_from_genesis() {
2290+
let start_slot = 1;
2291+
let end_slot = E::slots_per_epoch() * 2;
2292+
let slots = (start_slot..end_slot).map(Slot::new).collect();
2293+
let checkpoint_slot = Slot::new(0);
2294+
weak_subjectivity_sync_test(slots, checkpoint_slot).await
2295+
}
2296+
22842297
async fn weak_subjectivity_sync_test(slots: Vec<Slot>, checkpoint_slot: Slot) {
22852298
// Build an initial chain on one harness, representing a synced node with full history.
22862299
let num_final_blocks = E::slots_per_epoch() * 2;
@@ -2367,7 +2380,15 @@ async fn weak_subjectivity_sync_test(slots: Vec<Slot>, checkpoint_slot: Slot) {
23672380
);
23682381
slot_clock.set_slot(harness.get_current_slot().as_u64());
23692382

2383+
let chain_config = ChainConfig {
2384+
// Set reconstruct_historic_states to true from the start in the genesis case. This makes
2385+
// some of the later checks more uniform across the genesis/non-genesis cases.
2386+
reconstruct_historic_states: checkpoint_slot == 0,
2387+
..ChainConfig::default()
2388+
};
2389+
23702390
let beacon_chain = BeaconChainBuilder::<DiskHarnessType<E>>::new(MinimalEthSpec, kzg)
2391+
.chain_config(chain_config)
23712392
.store(store.clone())
23722393
.custom_spec(test_spec::<E>().into())
23732394
.task_executor(harness.chain.task_executor.clone())
@@ -2381,7 +2402,6 @@ async fn weak_subjectivity_sync_test(slots: Vec<Slot>, checkpoint_slot: Slot) {
23812402
.store_migrator_config(MigratorConfig::default().blocking())
23822403
.slot_clock(slot_clock)
23832404
.shutdown_sender(shutdown_tx)
2384-
.chain_config(ChainConfig::default())
23852405
.event_handler(Some(ServerSentEventHandler::new_with_capacity(1)))
23862406
.execution_layer(Some(mock.el))
23872407
.rng(Box::new(StdRng::seed_from_u64(42)))
@@ -2449,96 +2469,118 @@ async fn weak_subjectivity_sync_test(slots: Vec<Slot>, checkpoint_slot: Slot) {
24492469
assert_eq!(state.update_tree_hash_cache().unwrap(), state_root);
24502470
}
24512471

2452-
// Forwards iterator from 0 should fail as we lack blocks.
2453-
assert!(matches!(
2454-
beacon_chain.forwards_iter_block_roots(Slot::new(0)),
2455-
Err(BeaconChainError::HistoricalBlockOutOfRange { .. })
2456-
));
2457-
2458-
// Simulate processing of a `StatusMessage` with an older finalized epoch by calling
2459-
// `block_root_at_slot` with an old slot for which we don't know the block root. It should
2460-
// return `None` rather than erroring.
2461-
assert_eq!(
2462-
beacon_chain
2463-
.block_root_at_slot(Slot::new(1), WhenSlotSkipped::None)
2464-
.unwrap(),
2465-
None
2466-
);
2472+
if checkpoint_slot != 0 {
2473+
// Forwards iterator from 0 should fail as we lack blocks (unless checkpoint slot is 0).
2474+
assert!(matches!(
2475+
beacon_chain.forwards_iter_block_roots(Slot::new(0)),
2476+
Err(BeaconChainError::HistoricalBlockOutOfRange { .. })
2477+
));
2478+
} else {
2479+
assert_eq!(
2480+
beacon_chain
2481+
.forwards_iter_block_roots(Slot::new(0))
2482+
.unwrap()
2483+
.next()
2484+
.unwrap()
2485+
.unwrap(),
2486+
(wss_block_root, Slot::new(0))
2487+
);
2488+
}
24672489

2468-
// Simulate querying the API for a historic state that is unknown. It should also return
2469-
// `None` rather than erroring.
2470-
assert_eq!(beacon_chain.state_root_at_slot(Slot::new(1)).unwrap(), None);
2490+
// The checks in this block only make sense if some data is missing as a result of the
2491+
// checkpoint sync, i.e. if we are not just checkpoint syncing from genesis.
2492+
if checkpoint_slot != 0 {
2493+
// Simulate processing of a `StatusMessage` with an older finalized epoch by calling
2494+
// `block_root_at_slot` with an old slot for which we don't know the block root. It should
2495+
// return `None` rather than erroring.
2496+
assert_eq!(
2497+
beacon_chain
2498+
.block_root_at_slot(Slot::new(1), WhenSlotSkipped::None)
2499+
.unwrap(),
2500+
None
2501+
);
24712502

2472-
// Supply blocks backwards to reach genesis. Omit the genesis block to check genesis handling.
2473-
let historical_blocks = chain_dump[..wss_block.slot().as_usize()]
2474-
.iter()
2475-
.filter(|s| s.beacon_block.slot() != 0)
2476-
.map(|s| s.beacon_block.clone())
2477-
.collect::<Vec<_>>();
2503+
// Simulate querying the API for a historic state that is unknown. It should also return
2504+
// `None` rather than erroring.
2505+
assert_eq!(beacon_chain.state_root_at_slot(Slot::new(1)).unwrap(), None);
2506+
2507+
// Supply blocks backwards to reach genesis. Omit the genesis block to check genesis handling.
2508+
let historical_blocks = chain_dump[..wss_block.slot().as_usize()]
2509+
.iter()
2510+
.filter(|s| s.beacon_block.slot() != 0)
2511+
.map(|s| s.beacon_block.clone())
2512+
.collect::<Vec<_>>();
2513+
2514+
let mut available_blocks = vec![];
2515+
for blinded in historical_blocks {
2516+
let block_root = blinded.canonical_root();
2517+
let full_block = harness
2518+
.chain
2519+
.get_block(&block_root)
2520+
.await
2521+
.expect("should get block")
2522+
.expect("should get block");
24782523

2479-
let mut available_blocks = vec![];
2480-
for blinded in historical_blocks {
2481-
let block_root = blinded.canonical_root();
2482-
let full_block = harness
2483-
.chain
2484-
.get_block(&block_root)
2485-
.await
2486-
.expect("should get block")
2487-
.expect("should get block");
2524+
if let MaybeAvailableBlock::Available(block) = harness
2525+
.chain
2526+
.data_availability_checker
2527+
.verify_kzg_for_rpc_block(
2528+
harness
2529+
.build_rpc_block_from_store_blobs(Some(block_root), Arc::new(full_block)),
2530+
)
2531+
.expect("should verify kzg")
2532+
{
2533+
available_blocks.push(block);
2534+
}
2535+
}
24882536

2489-
if let MaybeAvailableBlock::Available(block) = harness
2490-
.chain
2491-
.data_availability_checker
2492-
.verify_kzg_for_rpc_block(
2493-
harness.build_rpc_block_from_store_blobs(Some(block_root), Arc::new(full_block)),
2537+
// Corrupt the signature on the 1st block to ensure that the backfill processor is checking
2538+
// signatures correctly. Regression test for https://github.com/sigp/lighthouse/pull/5120.
2539+
let mut batch_with_invalid_first_block =
2540+
available_blocks.iter().map(clone_block).collect::<Vec<_>>();
2541+
batch_with_invalid_first_block[0] = {
2542+
let (block_root, block, data) = clone_block(&available_blocks[0]).deconstruct();
2543+
let mut corrupt_block = (*block).clone();
2544+
*corrupt_block.signature_mut() = Signature::empty();
2545+
AvailableBlock::__new_for_testing(
2546+
block_root,
2547+
Arc::new(corrupt_block),
2548+
data,
2549+
Arc::new(spec),
24942550
)
2495-
.expect("should verify kzg")
2496-
{
2497-
available_blocks.push(block);
2498-
}
2499-
}
2551+
};
2552+
2553+
// Importing the invalid batch should error.
2554+
assert!(matches!(
2555+
beacon_chain
2556+
.import_historical_block_batch(batch_with_invalid_first_block)
2557+
.unwrap_err(),
2558+
HistoricalBlockError::InvalidSignature
2559+
));
25002560

2501-
// Corrupt the signature on the 1st block to ensure that the backfill processor is checking
2502-
// signatures correctly. Regression test for https://github.com/sigp/lighthouse/pull/5120.
2503-
let mut batch_with_invalid_first_block =
2504-
available_blocks.iter().map(clone_block).collect::<Vec<_>>();
2505-
batch_with_invalid_first_block[0] = {
2506-
let (block_root, block, data) = clone_block(&available_blocks[0]).deconstruct();
2507-
let mut corrupt_block = (*block).clone();
2508-
*corrupt_block.signature_mut() = Signature::empty();
2509-
AvailableBlock::__new_for_testing(block_root, Arc::new(corrupt_block), data, Arc::new(spec))
2510-
};
2561+
let available_blocks_slots = available_blocks
2562+
.iter()
2563+
.map(|block| (block.block().slot(), block.block().canonical_root()))
2564+
.collect::<Vec<_>>();
2565+
info!(
2566+
?available_blocks_slots,
2567+
"wss_block_slot" = wss_block.slot().as_usize(),
2568+
"Importing historical block batch"
2569+
);
25112570

2512-
// Importing the invalid batch should error.
2513-
assert!(matches!(
2571+
// Importing the batch with valid signatures should succeed.
2572+
let available_blocks_dup = available_blocks.iter().map(clone_block).collect::<Vec<_>>();
2573+
assert_eq!(beacon_chain.store.get_oldest_block_slot(), wss_block.slot());
25142574
beacon_chain
2515-
.import_historical_block_batch(batch_with_invalid_first_block)
2516-
.unwrap_err(),
2517-
HistoricalBlockError::InvalidSignature
2518-
));
2519-
2520-
let available_blocks_slots = available_blocks
2521-
.iter()
2522-
.map(|block| (block.block().slot(), block.block().canonical_root()))
2523-
.collect::<Vec<_>>();
2524-
info!(
2525-
?available_blocks_slots,
2526-
"wss_block_slot" = wss_block.slot().as_usize(),
2527-
"Importing historical block batch"
2528-
);
2529-
2530-
// Importing the batch with valid signatures should succeed.
2531-
let available_blocks_dup = available_blocks.iter().map(clone_block).collect::<Vec<_>>();
2532-
assert_eq!(beacon_chain.store.get_oldest_block_slot(), wss_block.slot());
2533-
beacon_chain
2534-
.import_historical_block_batch(available_blocks_dup)
2535-
.unwrap();
2536-
assert_eq!(beacon_chain.store.get_oldest_block_slot(), 0);
2575+
.import_historical_block_batch(available_blocks_dup)
2576+
.unwrap();
2577+
assert_eq!(beacon_chain.store.get_oldest_block_slot(), 0);
25372578

2538-
// Resupplying the blocks should not fail, they can be safely ignored.
2539-
beacon_chain
2540-
.import_historical_block_batch(available_blocks)
2541-
.unwrap();
2579+
// Resupplying the blocks should not fail, they can be safely ignored.
2580+
beacon_chain
2581+
.import_historical_block_batch(available_blocks)
2582+
.unwrap();
2583+
}
25422584

25432585
// Sanity check for non-aligned WSS starts, to make sure the WSS block is persisted properly
25442586
if wss_block_slot != wss_state_slot {
@@ -2615,7 +2657,11 @@ async fn weak_subjectivity_sync_test(slots: Vec<Slot>, checkpoint_slot: Slot) {
26152657
assert_eq!(store.get_anchor_info().anchor_slot, wss_aligned_slot);
26162658
assert_eq!(
26172659
store.get_anchor_info().state_upper_limit,
2618-
Slot::new(u64::MAX)
2660+
if checkpoint_slot == 0 {
2661+
Slot::new(0)
2662+
} else {
2663+
Slot::new(u64::MAX)
2664+
}
26192665
);
26202666
info!(anchor = ?store.get_anchor_info(), "anchor pre");
26212667

beacon_node/store/src/reconstruct.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ where
4747
let lower_limit_slot = anchor.state_lower_limit;
4848
let upper_limit_slot = std::cmp::min(split.slot, anchor.state_upper_limit);
4949

50+
// If the split is at 0 we can't reconstruct historic states.
51+
if split.slot == 0 {
52+
debug!("No state reconstruction possible");
53+
return Ok(());
54+
}
55+
5056
// If `num_blocks` is not specified iterate all blocks. Add 1 so that we end on an epoch
5157
// boundary when `num_blocks` is a multiple of an epoch boundary. We want to be *inclusive*
5258
// of the state at slot `lower_limit_slot + num_blocks`.

0 commit comments

Comments
 (0)