Skip to content

Commit 25f0e26

Browse files
committed
Don't return errors when fork choice fails (#3370)
## Issue Addressed NA ## Proposed Changes There are scenarios where the only viable head will have an invalid execution payload, in this scenario the `get_head` function on `proto_array` will return an error. We must recover from this scenario by importing blocks from the network. This PR stops `BeaconChain::recompute_head` from returning an error so that we can't accidentally start down-scoring peers or aborting block import just because the current head has an invalid payload. ## Reviewer Notes The following changes are included: 1. Allow `fork_choice.get_head` to fail gracefully in `BeaconChain::process_block` when trying to update the `early_attester_cache`; simply don't add the block to the cache rather than aborting the entire process. 1. Don't return an error from `BeaconChain::recompute_head_at_current_slot` and `BeaconChain::recompute_head` to defensively prevent calling functions from aborting any process just because the fork choice function failed to run. - This should have practically no effect, since most callers were still continuing if recomputing the head failed. - The outlier is that the API will return 200 rather than a 500 when fork choice fails. 1. Add the `ProtoArrayForkChoice::set_all_blocks_to_optimistic` function to recover from the scenario where we've rebooted and the persisted fork choice has an invalid head.
1 parent d04fde3 commit 25f0e26

File tree

16 files changed

+466
-147
lines changed

16 files changed

+466
-147
lines changed

beacon_node/beacon_chain/src/beacon_chain.rs

Lines changed: 29 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2805,32 +2805,38 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
28052805
if !payload_verification_status.is_optimistic()
28062806
&& block.slot() + EARLY_ATTESTER_CACHE_HISTORIC_SLOTS >= current_slot
28072807
{
2808-
let new_head_root = fork_choice
2809-
.get_head(current_slot, &self.spec)
2810-
.map_err(BeaconChainError::from)?;
2811-
2812-
if new_head_root == block_root {
2813-
if let Some(proto_block) = fork_choice.get_block(&block_root) {
2814-
if let Err(e) = self.early_attester_cache.add_head_block(
2815-
block_root,
2816-
signed_block.clone(),
2817-
proto_block,
2818-
&state,
2819-
&self.spec,
2820-
) {
2808+
match fork_choice.get_head(current_slot, &self.spec) {
2809+
// This block became the head, add it to the early attester cache.
2810+
Ok(new_head_root) if new_head_root == block_root => {
2811+
if let Some(proto_block) = fork_choice.get_block(&block_root) {
2812+
if let Err(e) = self.early_attester_cache.add_head_block(
2813+
block_root,
2814+
signed_block.clone(),
2815+
proto_block,
2816+
&state,
2817+
&self.spec,
2818+
) {
2819+
warn!(
2820+
self.log,
2821+
"Early attester cache insert failed";
2822+
"error" => ?e
2823+
);
2824+
}
2825+
} else {
28212826
warn!(
28222827
self.log,
2823-
"Early attester cache insert failed";
2824-
"error" => ?e
2828+
"Early attester block missing";
2829+
"block_root" => ?block_root
28252830
);
28262831
}
2827-
} else {
2828-
warn!(
2829-
self.log,
2830-
"Early attester block missing";
2831-
"block_root" => ?block_root
2832-
);
28332832
}
2833+
// This block did not become the head, nothing to do.
2834+
Ok(_) => (),
2835+
Err(e) => error!(
2836+
self.log,
2837+
"Failed to compute head during block import";
2838+
"error" => ?e
2839+
),
28342840
}
28352841
}
28362842

@@ -3608,16 +3614,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
36083614

36093615
// Run fork choice since it's possible that the payload invalidation might result in a new
36103616
// head.
3611-
//
3612-
// Don't return early though, since invalidating the justified checkpoint might cause an
3613-
// error here.
3614-
if let Err(e) = self.recompute_head_at_current_slot().await {
3615-
crit!(
3616-
self.log,
3617-
"Failed to run fork choice routine";
3618-
"error" => ?e,
3619-
);
3620-
}
3617+
self.recompute_head_at_current_slot().await;
36213618

36223619
// Obtain the justified root from fork choice.
36233620
//
@@ -4262,14 +4259,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
42624259
}
42634260

42644261
// Run fork choice and signal to any waiting task that it has completed.
4265-
if let Err(e) = self.recompute_head_at_current_slot().await {
4266-
error!(
4267-
self.log,
4268-
"Fork choice error at slot start";
4269-
"error" => ?e,
4270-
"slot" => slot,
4271-
);
4272-
}
4262+
self.recompute_head_at_current_slot().await;
42734263

42744264
// Send the notification regardless of fork choice success, this is a "best effort"
42754265
// notification and we don't want block production to hit the timeout in case of error.

beacon_node/beacon_chain/src/canonical_head.rs

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -434,9 +434,15 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
434434
/// Execute the fork choice algorithm and enthrone the result as the canonical head.
435435
///
436436
/// This method replaces the old `BeaconChain::fork_choice` method.
437-
pub async fn recompute_head_at_current_slot(self: &Arc<Self>) -> Result<(), Error> {
438-
let current_slot = self.slot()?;
439-
self.recompute_head_at_slot(current_slot).await
437+
pub async fn recompute_head_at_current_slot(self: &Arc<Self>) {
438+
match self.slot() {
439+
Ok(current_slot) => self.recompute_head_at_slot(current_slot).await,
440+
Err(e) => error!(
441+
self.log,
442+
"No slot when recomputing head";
443+
"error" => ?e
444+
),
445+
}
440446
}
441447

442448
/// Execute the fork choice algorithm and enthrone the result as the canonical head.
@@ -445,7 +451,13 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
445451
/// different slot to the wall-clock can be useful for pushing fork choice into the next slot
446452
/// *just* before the start of the slot. This ensures that block production can use the correct
447453
/// head value without being delayed.
448-
pub async fn recompute_head_at_slot(self: &Arc<Self>, current_slot: Slot) -> Result<(), Error> {
454+
///
455+
/// This function purposefully does *not* return a `Result`. It's possible for fork choice to
456+
/// fail to update if there is only one viable head and it has an invalid execution payload. In
457+
/// such a case it's critical that the `BeaconChain` keeps importing blocks so that the
458+
/// situation can be rectified. We avoid returning an error here so that calling functions
459+
/// can't abort block import because an error is returned here.
460+
pub async fn recompute_head_at_slot(self: &Arc<Self>, current_slot: Slot) {
449461
metrics::inc_counter(&metrics::FORK_CHOICE_REQUESTS);
450462
let _timer = metrics::start_timer(&metrics::FORK_CHOICE_TIMES);
451463

@@ -455,23 +467,22 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
455467
move || chain.recompute_head_at_slot_internal(current_slot),
456468
"recompute_head_internal",
457469
)
458-
.await?
470+
.await
459471
{
460472
// Fork choice returned successfully and did not need to update the EL.
461-
Ok(None) => Ok(()),
473+
Ok(Ok(None)) => (),
462474
// Fork choice returned successfully and needed to update the EL. It has returned a
463475
// join-handle from when it spawned some async tasks. We should await those tasks.
464-
Ok(Some(join_handle)) => match join_handle.await {
476+
Ok(Ok(Some(join_handle))) => match join_handle.await {
465477
// The async task completed successfully.
466-
Ok(Some(())) => Ok(()),
478+
Ok(Some(())) => (),
467479
// The async task did not complete successfully since the runtime is shutting down.
468480
Ok(None) => {
469481
debug!(
470482
self.log,
471483
"Did not update EL fork choice";
472484
"info" => "shutting down"
473485
);
474-
Err(Error::RuntimeShutdown)
475486
}
476487
// The async task did not complete successfully, tokio returned an error.
477488
Err(e) => {
@@ -480,13 +491,24 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
480491
"Did not update EL fork choice";
481492
"error" => ?e
482493
);
483-
Err(Error::TokioJoin(e))
484494
}
485495
},
486496
// There was an error recomputing the head.
487-
Err(e) => {
497+
Ok(Err(e)) => {
488498
metrics::inc_counter(&metrics::FORK_CHOICE_ERRORS);
489-
Err(e)
499+
error!(
500+
self.log,
501+
"Error whist recomputing head";
502+
"error" => ?e
503+
);
504+
}
505+
// There was an error spawning the task.
506+
Err(e) => {
507+
error!(
508+
self.log,
509+
"Failed to spawn recompute head task";
510+
"error" => ?e
511+
);
490512
}
491513
}
492514
}

beacon_node/beacon_chain/src/state_advance_timer.rs

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -220,14 +220,7 @@ async fn state_advance_timer<T: BeaconChainTypes>(
220220
return;
221221
}
222222

223-
if let Err(e) = beacon_chain.recompute_head_at_slot(next_slot).await {
224-
warn!(
225-
log,
226-
"Error updating fork choice for next slot";
227-
"error" => ?e,
228-
"slot" => next_slot,
229-
);
230-
}
223+
beacon_chain.recompute_head_at_slot(next_slot).await;
231224

232225
// Use a blocking task to avoid blocking the core executor whilst waiting for locks
233226
// in `ForkChoiceSignalTx`.

beacon_node/beacon_chain/src/test_utils.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,20 @@ impl<E: EthSpec> Builder<EphemeralHarnessType<E>> {
211211
self.store = Some(store);
212212
self.store_mutator(Box::new(mutator))
213213
}
214+
215+
/// Manually restore from a given `MemoryStore`.
216+
pub fn resumed_ephemeral_store(
217+
mut self,
218+
store: Arc<HotColdDB<E, MemoryStore<E>, MemoryStore<E>>>,
219+
) -> Self {
220+
let mutator = move |builder: BeaconChainBuilder<_>| {
221+
builder
222+
.resume_from_db()
223+
.expect("should resume from database")
224+
};
225+
self.store = Some(store);
226+
self.store_mutator(Box::new(mutator))
227+
}
214228
}
215229

216230
impl<E: EthSpec> Builder<DiskHarnessType<E>> {
@@ -1376,7 +1390,7 @@ where
13761390
.process_block(Arc::new(block), CountUnrealized::True)
13771391
.await?
13781392
.into();
1379-
self.chain.recompute_head_at_current_slot().await?;
1393+
self.chain.recompute_head_at_current_slot().await;
13801394
Ok(block_hash)
13811395
}
13821396

@@ -1389,7 +1403,7 @@ where
13891403
.process_block(Arc::new(block), CountUnrealized::True)
13901404
.await?
13911405
.into();
1392-
self.chain.recompute_head_at_current_slot().await?;
1406+
self.chain.recompute_head_at_current_slot().await;
13931407
Ok(block_hash)
13941408
}
13951409

beacon_node/beacon_chain/tests/block_verification.rs

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,7 @@ async fn chain_segment_full_segment() {
160160
.into_block_error()
161161
.expect("should import chain segment");
162162

163-
harness
164-
.chain
165-
.recompute_head_at_current_slot()
166-
.await
167-
.expect("should run fork choice");
163+
harness.chain.recompute_head_at_current_slot().await;
168164

169165
assert_eq!(
170166
harness.head_block_root(),
@@ -194,11 +190,7 @@ async fn chain_segment_varying_chunk_size() {
194190
.unwrap_or_else(|_| panic!("should import chain segment of len {}", chunk_size));
195191
}
196192

197-
harness
198-
.chain
199-
.recompute_head_at_current_slot()
200-
.await
201-
.expect("should run fork choice");
193+
harness.chain.recompute_head_at_current_slot().await;
202194

203195
assert_eq!(
204196
harness.head_block_root(),
@@ -729,11 +721,7 @@ async fn block_gossip_verification() {
729721
}
730722

731723
// Recompute the head to ensure we cache the latest view of fork choice.
732-
harness
733-
.chain
734-
.recompute_head_at_current_slot()
735-
.await
736-
.unwrap();
724+
harness.chain.recompute_head_at_current_slot().await;
737725

738726
/*
739727
* This test ensures that:

0 commit comments

Comments
 (0)