This repository was archived by the owner on Jan 22, 2025. It is now read-only.

Commit 1df15d8

Fix tower/blockstore unsync due to external causes (#12671)
* Fix tower/blockstore unsync due to external causes
* Add and clean up long comments
* Clean up test
* Comment about warped_slot_history
* Run test_future_tower with master-only/master-slave
* Update comments about false leader condition
1 parent 9263ae1 commit 1df15d8

File tree

3 files changed: +287 -69 lines


core/src/consensus.rs

Lines changed: 163 additions & 62 deletions
@@ -515,6 +515,59 @@ impl Tower {
             .map(|last_voted_slot| {
                 let root = self.root();
                 let empty_ancestors = HashSet::default();
+                let empty_ancestors_due_to_minor_unsynced_ledger = || {
+                    // This condition (stale stray last vote) shouldn't occur under normal validator
+                    // operation, indicating something unusual happened.
+                    // This condition could be introduced by manual ledger mishandling,
+                    // validator SEGV, OS/HW crash, or plain No Free Space FS error.
+
+                    // However, returning empty ancestors as a fallback here shouldn't result in
+                    // slashing by itself (Note that we couldn't fully preclude any kind of slashing if
+                    // the failure was OS or HW level).
+
+                    // Firstly, lockout is ensured elsewhere.
+
+                    // Also, there is no risk of optimistic conf. violation. Although empty ancestors
+                    // could result in incorrect (= more than actual) locked_out_stake and
+                    // false-positive SwitchProof later in this function, there should be no such a
+                    // heavier fork candidate, first of all, if the last vote (or any of its
+                    // unavailable ancestors) were already optimistically confirmed.
+                    // The only exception is that other validator is already violating it...
+                    if self.is_first_switch_check() && switch_slot < last_voted_slot {
+                        // `switch < last` is needed not to warn! this message just because of using
+                        // newer snapshots on validator restart
+                        let message = format!(
+                            "bank_forks doesn't have corresponding data for the stray restored \
+                             last vote({}), meaning some inconsistency between saved tower and ledger.",
+                            last_voted_slot
+                        );
+                        warn!("{}", message);
+                        datapoint_warn!("tower_warn", ("warn", message, String));
+                    }
+                    &empty_ancestors
+                };
+
+                let suspended_decision_due_to_major_unsynced_ledger = || {
+                    // This peculiar corner handling is needed mainly for a tower which is newer than
+                    // blockstore. (Yeah, we tolerate it for ease of maintaining validator by operators)
+                    // This condition could be introduced by manual ledger mishandling,
+                    // validator SEGV, OS/HW crash, or plain No Free Space FS error.
+
+                    // When we're in this clause, it basically means validator is badly running
+                    // with a future tower while replaying past slots, especially problematic is
+                    // last_voted_slot.
+                    // So, don't re-vote on it by returning pseudo FailedSwitchThreshold, otherwise
+                    // there would be slashing because of double vote on one of last_vote_ancestors.
+                    // (Well, needless to say, re-creating the duplicate block must be handled properly
+                    // at the banking stage: https://github.com/solana-labs/solana/issues/8232)
+                    //
+                    // To be specific, the replay stage is tricked into a false perception where
+                    // last_vote_ancestors is AVAILABLE for descendant-of-`switch_slot`, stale, and
+                    // stray slots (which should always be empty_ancestors).
+                    //
+                    // This is covered by test_future_tower_* in local_cluster
+                    SwitchForkDecision::FailedSwitchThreshold(0, total_stake)
+                };
 
                 let last_vote_ancestors =
                     ancestors.get(&last_voted_slot).unwrap_or_else(|| {
@@ -529,35 +582,7 @@ impl Tower {
                             // all of them.
                             panic!("no ancestors found with slot: {}", last_voted_slot);
                         } else {
-                            // This condition (stale stray last vote) shouldn't occur under normal validator
-                            // operation, indicating something unusual happened.
-                            // Possible causes include: OS/HW crash, validator process crash, only saved tower
-                            // is moved over to a new setup, etc...
-
-                            // However, returning empty ancestors as a fallback here shouldn't result in
-                            // slashing by itself (Note that we couldn't fully preclude any kind of slashing if
-                            // the failure was OS or HW level).
-
-                            // Firstly, lockout is ensured elsewhere.
-
-                            // Also, there is no risk of optimistic conf. violation. Although empty ancestors
-                            // could result in incorrect (= more than actual) locked_out_stake and
-                            // false-positive SwitchProof later in this function, there should be no such a
-                            // heavier fork candidate, first of all, if the last vote (or any of its
-                            // unavailable ancestors) were already optimistically confirmed.
-                            // The only exception is that other validator is already violating it...
-                            if self.is_first_switch_check() && switch_slot < last_voted_slot {
-                                // `switch < last` is needed not to warn! this message just because of using
-                                // newer snapshots on validator restart
-                                let message = format!(
-                                    "bank_forks doesn't have corresponding data for the stray restored \
-                                     last vote({}), meaning some inconsistency between saved tower and ledger.",
-                                    last_voted_slot
-                                );
-                                warn!("{}", message);
-                                datapoint_warn!("tower_warn", ("warn", message, String));
-                            }
-                            &empty_ancestors
+                            empty_ancestors_due_to_minor_unsynced_ledger()
                         }
                     });
 
@@ -569,13 +594,18 @@ impl Tower {
                     return SwitchForkDecision::SameFork;
                 }
 
-                assert!(
-                    !last_vote_ancestors.contains(&switch_slot),
-                    "Should never consider switching to slot ({}), which is ancestors({:?}) of last vote: {}",
-                    switch_slot,
-                    last_vote_ancestors,
-                    last_voted_slot
-                );
+                if last_vote_ancestors.contains(&switch_slot) {
+                    if !self.is_stray_last_vote() {
+                        panic!(
+                            "Should never consider switching to slot ({}), which is ancestors({:?}) of last vote: {}",
+                            switch_slot,
+                            last_vote_ancestors,
+                            last_voted_slot
+                        );
+                    } else {
+                        return suspended_decision_due_to_major_unsynced_ledger();
+                    }
+                }
 
                 // By this point, we know the `switch_slot` is on a different fork
                 // (is neither an ancestor nor descendant of `last_vote`), so a
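
Illustrative aside (not part of the commit): the hunk above is where the pseudo FailedSwitchThreshold produced by suspended_decision_due_to_major_unsynced_ledger() is returned instead of panicking. The sketch below shows why that value is a safe way to suspend voting on the stray last vote; the enum is a simplified stand-in that only mirrors the variants visible in this diff, and may_vote_on_switch_candidate() is a hypothetical consumer, not the real replay-stage logic.

// Simplified stand-in for the real SwitchForkDecision in core/src/consensus.rs;
// only the variants visible in this diff are mirrored.
#[derive(Debug)]
enum SwitchForkDecision {
    SameFork,
    FailedSwitchThreshold(u64, u64), // (locked-out stake, total stake)
    SwitchProof([u8; 32]),           // stand-in for the real switching-proof hash
}

// Hypothetical consumer: may the validator vote on the switch candidate?
fn may_vote_on_switch_candidate(decision: &SwitchForkDecision) -> bool {
    match decision {
        // Candidate is on the current fork; voting continues normally.
        SwitchForkDecision::SameFork => true,
        // The pseudo decision returned above for a stray last vote: refuse to
        // vote, so no double vote can land on one of last_vote_ancestors.
        SwitchForkDecision::FailedSwitchThreshold(..) => false,
        // Only an explicit proof authorizes switching (and voting) across forks.
        SwitchForkDecision::SwitchProof(_) => true,
    }
}

fn main() {
    let total_stake = 100;
    let decision = SwitchForkDecision::FailedSwitchThreshold(0, total_stake);
    assert!(!may_vote_on_switch_candidate(&decision));
    println!("{:?} => may vote: {}", decision, may_vote_on_switch_candidate(&decision));
}
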
@@ -846,14 +876,6 @@ impl Tower {
         );
         assert_eq!(slot_history.check(replayed_root), Check::Found);
 
-        // reconcile_blockstore_roots_with_tower() should already have aligned these.
-        assert!(
-            tower_root <= replayed_root,
-            format!(
-                "tower root: {:?} >= replayed root slot: {}",
-                tower_root, replayed_root
-            )
-        );
         assert!(
             self.last_vote == Vote::default() && self.lockouts.votes.is_empty()
                 || self.last_vote != Vote::default() && !self.lockouts.votes.is_empty(),
@@ -864,16 +886,59 @@ impl Tower {
         );
 
         if let Some(last_voted_slot) = self.last_voted_slot() {
-            if slot_history.check(last_voted_slot) == Check::TooOld {
-                // We could try hard to anchor with other older votes, but opt to simplify the
-                // following logic
-                return Err(TowerError::TooOldTower(
+            if tower_root <= replayed_root {
+                // Normally, we goes into this clause with possible help of
+                // reconcile_blockstore_roots_with_tower()
+                if slot_history.check(last_voted_slot) == Check::TooOld {
+                    // We could try hard to anchor with other older votes, but opt to simplify the
+                    // following logic
+                    return Err(TowerError::TooOldTower(
+                        last_voted_slot,
+                        slot_history.oldest(),
+                    ));
+                }
+
+                self.adjust_lockouts_with_slot_history(slot_history)?;
+                self.initialize_root(replayed_root);
+            } else {
+                // This should never occur under normal operation.
+                // While this validator's voting is suspended this way,
+                // suspended_decision_due_to_major_unsynced_ledger() will be also touched.
+                let message = format!(
+                    "For some reason, we're REPROCESSING slots which has already been \
+                     voted and ROOTED by us; \
+                     VOTING will be SUSPENDED UNTIL {}!",
                     last_voted_slot,
-                    slot_history.oldest(),
-                ));
+                );
+                error!("{}", message);
+                datapoint_error!("tower_error", ("error", message, String));
+
+                // Let's pass-through adjust_lockouts_with_slot_history just for sanitization,
+                // using a synthesized SlotHistory.
+
+                let mut warped_slot_history = (*slot_history).clone();
+                // Blockstore doesn't have the tower_root slot because of
+                // (replayed_root < tower_root) in this else clause, meaning the tower is from
+                // the future from the view of blockstore.
+                // Pretend the blockstore has the future tower_root to anchor exactly with that
+                // slot by adding tower_root to a slot history. The added slot will be newer
+                // than all slots in the slot history (remember tower_root > replayed_root),
+                // satisfying the slot history invariant.
+                // Thus, the whole process will be safe as well because tower_root exists
+                // within both tower and slot history, guaranteeing the success of adjustment
+                // and retaining all of future votes correctly while sanitizing.
+                warped_slot_history.add(tower_root);
+
+                self.adjust_lockouts_with_slot_history(&warped_slot_history)?;
+                // don't update root; future tower's root should be kept across validator
+                // restarts to continue to show the scary messages at restarts until the next
+                // voting.
             }
-            self.adjust_lockouts_with_slot_history(slot_history)?;
-            self.initialize_root(replayed_root);
+        } else {
+            // This else clause is for newly created tower.
+            // initialize_lockouts_from_bank() should ensure the following invariant,
+            // otherwise we're screwing something up.
+            assert_eq!(tower_root, replayed_root);
         }
 
         Ok(self)
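
Illustrative aside (not part of the commit): the new else branch above sanitizes a future tower against a cloned, "warped" SlotHistory instead of asserting tower_root <= replayed_root. Below is a minimal sketch of that idea, reusing the slot numbers from the new test at the bottom of this diff and assuming solana-sdk's slot_history API (SlotHistory::default/add/check, Check, and clone, all of which appear in this file).

// Minimal sketch of the "warped" SlotHistory trick; not code from the commit.
use solana_sdk::slot_history::{Check, SlotHistory};

fn main() {
    // Replay has only rooted slot 2, but the restored tower is rooted at 12,
    // i.e. the tower is "from the future" relative to the blockstore.
    let replayed_root: u64 = 2;
    let tower_root: u64 = 12;

    let mut slot_history = SlotHistory::default();
    slot_history.add(0);
    slot_history.add(replayed_root);

    // The real slot history cannot anchor the future tower root...
    assert_ne!(slot_history.check(tower_root), Check::Found);

    // ...so clone it and pretend it can. The added slot is newer than every slot
    // already in the history, so the slot-history invariant still holds, and the
    // adjustment now has an anchor present in both the tower and the history.
    let mut warped_slot_history = slot_history.clone();
    warped_slot_history.add(tower_root);
    assert_eq!(warped_slot_history.check(tower_root), Check::Found);

    println!("warped history anchors tower root {}", tower_root);
}
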
@@ -1152,8 +1217,11 @@ impl SavedTower {
     }
 }
 
-// Given an untimely crash, tower may have roots that are not reflected in blockstore because
-// `ReplayState::handle_votable_bank()` saves tower before setting blockstore roots
+// Given an untimely crash, tower may have roots that are not reflected in blockstore,
+// or the reverse of this.
+// That's because we don't impose any ordering guarantee or any kind of write barriers
+// between tower (plain old POSIX fs calls) and blockstore (through RocksDB), when
+// `ReplayState::handle_votable_bank()` saves tower before setting blockstore roots.
 pub fn reconcile_blockstore_roots_with_tower(
     tower: &Tower,
     blockstore: &Blockstore,
@@ -1173,12 +1241,23 @@ pub fn reconcile_blockstore_roots_with_tower(
                 ),
             })
             .collect();
-        assert!(
-            !new_roots.is_empty(),
-            "at least 1 parent slot must be found"
-        );
-
-        blockstore.set_roots(&new_roots)?;
+        if !new_roots.is_empty() {
+            info!(
+                "Reconciling slots as root based on tower root: {:?} ({}..{}) ",
+                new_roots, tower_root, last_blockstore_root
+            );
+            blockstore.set_roots(&new_roots)?;
+        } else {
+            // This indicates we're in bad state; but still don't panic here.
+            // That's because we might have a chance of recovering properly with
+            // newer snapshot.
+            warn!(
+                "Couldn't find any ancestor slots from tower root ({}) \
+                 towards blockstore root ({}); blockstore pruned or only \
+                 tower moved into new ledger?",
+                tower_root, last_blockstore_root,
+            );
+        }
     }
     Ok(())
 }
@@ -2700,8 +2779,7 @@ pub mod test {
     }
 
     #[test]
-    #[should_panic(expected = "at least 1 parent slot must be found")]
-    fn test_reconcile_blockstore_roots_with_tower_panic_no_parent() {
+    fn test_reconcile_blockstore_roots_with_tower_nop_no_parent() {
         solana_logger::setup();
         let blockstore_path = get_tmp_ledger_path!();
         {
@@ -2717,7 +2795,9 @@ pub mod test {
 
             let mut tower = Tower::new_with_key(&Pubkey::default());
             tower.lockouts.root_slot = Some(4);
+            assert_eq!(blockstore.last_root(), 0);
             reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
+            assert_eq!(blockstore.last_root(), 0);
         }
         Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction");
     }
@@ -3068,4 +3148,25 @@ pub mod test {
 
         assert!(tower.adjust_lockouts_after_replay(0, &slot_history).is_ok());
     }
+
+    #[test]
+    fn test_adjust_lockouts_after_replay_future_tower() {
+        let mut tower = Tower::new_for_tests(10, 0.9);
+        tower.lockouts.votes.push_back(Lockout::new(13));
+        tower.lockouts.votes.push_back(Lockout::new(14));
+        let vote = Vote::new(vec![14], Hash::default());
+        tower.last_vote = vote;
+        tower.initialize_root(12);
+
+        let mut slot_history = SlotHistory::default();
+        slot_history.add(0);
+        slot_history.add(2);
+
+        let tower = tower
+            .adjust_lockouts_after_replay(2, &slot_history)
+            .unwrap();
+        assert_eq!(tower.root(), 12);
+        assert_eq!(tower.voted_slots(), vec![13, 14]);
+        assert_eq!(tower.stray_restored_slot, Some(14));
+    }
 }
