@@ -515,6 +515,59 @@ impl Tower {
             .map(|last_voted_slot| {
                 let root = self.root();
                 let empty_ancestors = HashSet::default();
+                let empty_ancestors_due_to_minor_unsynced_ledger = || {
+                    // This condition (a stale, stray last vote) shouldn't occur under normal validator
+                    // operation; it indicates something unusual happened.
+                    // It could be introduced by manual ledger mishandling,
+                    // a validator SEGV, an OS/HW crash, or a plain out-of-disk-space FS error.
+
+                    // However, returning empty ancestors as a fallback here shouldn't result in
+                    // slashing by itself (note that we couldn't fully preclude any kind of slashing if
+                    // the failure was at the OS or HW level).
+
+                    // Firstly, lockout is ensured elsewhere.
+
+                    // Also, there is no risk of an optimistic confirmation violation. Although empty
+                    // ancestors could result in an incorrect (= larger than actual) locked_out_stake and
+                    // a false-positive SwitchProof later in this function, there should be no such
+                    // heavier fork candidate in the first place if the last vote (or any of its
+                    // unavailable ancestors) was already optimistically confirmed.
+                    // The only exception is if another validator is already violating it...
+                    if self.is_first_switch_check() && switch_slot < last_voted_slot {
+                        // `switch < last` is needed so that we don't warn! with this message merely
+                        // because a newer snapshot is used on validator restart
+                        let message = format!(
+                            "bank_forks doesn't have corresponding data for the stray restored \
+                             last vote({}), meaning some inconsistency between saved tower and ledger.",
+                            last_voted_slot
+                        );
+                        warn!("{}", message);
+                        datapoint_warn!("tower_warn", ("warn", message, String));
+                    }
+                    &empty_ancestors
+                };
+
+                let suspended_decision_due_to_major_unsynced_ledger = || {
+                    // This peculiar corner-case handling is needed mainly for a tower which is newer
+                    // than the blockstore (yes, we tolerate this to ease validator maintenance for operators).
+                    // It could be introduced by manual ledger mishandling,
+                    // a validator SEGV, an OS/HW crash, or a plain out-of-disk-space FS error.
+
+                    // When we're in this clause, it basically means the validator is badly running
+                    // with a future tower while replaying past slots; last_voted_slot is especially
+                    // problematic.
+                    // So, don't re-vote on it: return a pseudo FailedSwitchThreshold instead, otherwise
+                    // there would be slashing because of a double vote on one of last_vote_ancestors.
+                    // (Needless to say, re-creating the duplicate block must be handled properly
+                    // at the banking stage: https://github.com/solana-labs/solana/issues/8232)
+                    //
+                    // To be specific, the replay stage is tricked into the false perception that
+                    // last_vote_ancestors is AVAILABLE for descendant-of-`switch_slot`, stale, and
+                    // stray slots (which should always be empty_ancestors).
+                    //
+                    // This is covered by test_future_tower_* in local_cluster
+                    SwitchForkDecision::FailedSwitchThreshold(0, total_stake)
+                };
 
                 let last_vote_ancestors =
                     ancestors.get(&last_voted_slot).unwrap_or_else(|| {
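
A quick aside on why returning SwitchForkDecision::FailedSwitchThreshold(0, total_stake) effectively suspends re-voting: the replay stage only treats a switch as viable once the proof's stake clears the switch threshold, and zero stake can never clear it. A minimal sketch of that check, assuming an illustrative SWITCH_FORK_THRESHOLD constant and a plain ratio comparison (the constant name, value, and helper below are assumptions, not taken from this diff):

// Illustrative sketch only; the constant name/value and helper are assumptions.
const SWITCH_FORK_THRESHOLD: f64 = 0.38;

fn passes_switch_threshold(locked_out_stake: u64, total_stake: u64) -> bool {
    (locked_out_stake as f64 / total_stake as f64) > SWITCH_FORK_THRESHOLD
}

fn main() {
    // FailedSwitchThreshold(0, total_stake) carries zero locked-out stake, so this
    // check can never pass and the validator keeps refusing to switch (and thus to
    // re-vote) until its tower and ledger are consistent again.
    assert!(!passes_switch_threshold(0, 1_000_000));
}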
@@ -529,35 +582,7 @@ impl Tower {
                             // all of them.
                             panic!("no ancestors found with slot: {}", last_voted_slot);
                         } else {
-                            // This condition (stale stray last vote) shouldn't occur under normal validator
-                            // operation, indicating something unusual happened.
-                            // Possible causes include: OS/HW crash, validator process crash, only saved tower
-                            // is moved over to a new setup, etc...
-
-                            // However, returning empty ancestors as a fallback here shouldn't result in
-                            // slashing by itself (Note that we couldn't fully preclude any kind of slashing if
-                            // the failure was OS or HW level).
-
-                            // Firstly, lockout is ensured elsewhere.
-
-                            // Also, there is no risk of optimistic conf. violation. Although empty ancestors
-                            // could result in incorrect (= more than actual) locked_out_stake and
-                            // false-positive SwitchProof later in this function, there should be no such a
-                            // heavier fork candidate, first of all, if the last vote (or any of its
-                            // unavailable ancestors) were already optimistically confirmed.
-                            // The only exception is that other validator is already violating it...
-                            if self.is_first_switch_check() && switch_slot < last_voted_slot {
-                                // `switch < last` is needed not to warn! this message just because of using
-                                // newer snapshots on validator restart
-                                let message = format!(
-                                    "bank_forks doesn't have corresponding data for the stray restored \
-                                     last vote({}), meaning some inconsistency between saved tower and ledger.",
-                                    last_voted_slot
-                                );
-                                warn!("{}", message);
-                                datapoint_warn!("tower_warn", ("warn", message, String));
-                            }
-                            &empty_ancestors
+                            empty_ancestors_due_to_minor_unsynced_ledger()
                         }
                     });
 
@@ -569,13 +594,18 @@ impl Tower {
                     return SwitchForkDecision::SameFork;
                 }
 
-                assert!(
-                    !last_vote_ancestors.contains(&switch_slot),
-                    "Should never consider switching to slot ({}), which is ancestors({:?}) of last vote: {}",
-                    switch_slot,
-                    last_vote_ancestors,
-                    last_voted_slot
-                );
+                if last_vote_ancestors.contains(&switch_slot) {
+                    if !self.is_stray_last_vote() {
+                        panic!(
+                            "Should never consider switching to slot ({}), which is an ancestor ({:?}) of last vote: {}",
+                            switch_slot,
+                            last_vote_ancestors,
+                            last_voted_slot
+                        );
+                    } else {
+                        return suspended_decision_due_to_major_unsynced_ledger();
+                    }
+                }
 
                 // By this point, we know the `switch_slot` is on a different fork
                 // (is neither an ancestor nor descendant of `last_vote`), so a
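
The branch above relies on Tower::is_stray_last_vote(), which is not shown in this hunk. A plausible, purely hypothetical reconstruction of that predicate, written as a free function over the fields referenced elsewhere in this diff (stray_restored_slot, last_voted_slot), not the actual implementation:

type Slot = u64;

// Hypothetical sketch: the last vote counts as "stray" only while it is still the
// very slot that was restored from the saved tower without ledger backing.
fn is_stray_last_vote(stray_restored_slot: Option<Slot>, last_voted_slot: Option<Slot>) -> bool {
    match (stray_restored_slot, last_voted_slot) {
        (Some(stray), Some(last)) => stray == last,
        _ => false,
    }
}

fn main() {
    assert!(is_stray_last_vote(Some(14), Some(14)));
    assert!(!is_stray_last_vote(Some(14), Some(15)));
    assert!(!is_stray_last_vote(None, Some(14)));
}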
@@ -846,14 +876,6 @@ impl Tower {
         );
         assert_eq!(slot_history.check(replayed_root), Check::Found);
 
-        // reconcile_blockstore_roots_with_tower() should already have aligned these.
-        assert!(
-            tower_root <= replayed_root,
-            format!(
-                "tower root: {:?} >= replayed root slot: {}",
-                tower_root, replayed_root
-            )
-        );
         assert!(
             self.last_vote == Vote::default() && self.lockouts.votes.is_empty()
                 || self.last_vote != Vote::default() && !self.lockouts.votes.is_empty(),
@@ -864,16 +886,59 @@ impl Tower {
         );
 
         if let Some(last_voted_slot) = self.last_voted_slot() {
-            if slot_history.check(last_voted_slot) == Check::TooOld {
-                // We could try hard to anchor with other older votes, but opt to simplify the
-                // following logic
-                return Err(TowerError::TooOldTower(
+            if tower_root <= replayed_root {
+                // Normally we go into this clause, possibly with the help of
+                // reconcile_blockstore_roots_with_tower().
+                if slot_history.check(last_voted_slot) == Check::TooOld {
+                    // We could try hard to anchor with other older votes, but opt to simplify the
+                    // following logic
+                    return Err(TowerError::TooOldTower(
+                        last_voted_slot,
+                        slot_history.oldest(),
+                    ));
+                }
+
+                self.adjust_lockouts_with_slot_history(slot_history)?;
+                self.initialize_root(replayed_root);
+            } else {
+                // This should never occur under normal operation.
+                // While this validator's voting is suspended this way,
+                // suspended_decision_due_to_major_unsynced_ledger() will also be hit.
+                let message = format!(
+                    "For some reason, we're REPROCESSING slots which have already been \
+                     voted on and ROOTED by us; \
+                     VOTING will be SUSPENDED UNTIL {}!",
                     last_voted_slot,
-                slot_history.oldest(),
-            ));
+                );
+                error!("{}", message);
+                datapoint_error!("tower_error", ("error", message, String));
+
+                // Run adjust_lockouts_with_slot_history just for sanitization,
+                // using a synthesized SlotHistory.
+
+                let mut warped_slot_history = (*slot_history).clone();
+                // The blockstore doesn't have the tower_root slot because
+                // replayed_root < tower_root in this else clause, meaning the tower is from
+                // the future from the blockstore's point of view.
+                // Pretend the blockstore has the future tower_root, anchoring exactly on that
+                // slot by adding tower_root to the slot history. The added slot will be newer
+                // than all slots in the slot history (remember tower_root > replayed_root),
+                // satisfying the slot history invariant.
+                // Thus, the whole process will be safe as well, because tower_root exists
+                // in both the tower and the slot history, guaranteeing the success of the
+                // adjustment and correctly retaining all future votes while sanitizing.
+                warped_slot_history.add(tower_root);
+
+                self.adjust_lockouts_with_slot_history(&warped_slot_history)?;
+                // Don't update the root; the future tower's root should be kept across validator
+                // restarts to keep showing the scary messages at restart until the next
+                // vote.
             }
-            self.adjust_lockouts_with_slot_history(slot_history)?;
-            self.initialize_root(replayed_root);
+        } else {
+            // This else clause is for a newly created tower.
+            // initialize_lockouts_from_bank() should ensure the following invariant;
+            // otherwise we're screwing something up.
+            assert_eq!(tower_root, replayed_root);
         }
 
         Ok(self)
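
The "warped" slot history is the crux of the else branch above: cloning the replayed SlotHistory and adding the future tower_root gives the adjustment pass an anchor slot that both the tower and the synthesized history agree on. A small standalone sketch of just that trick, assuming only the solana_sdk SlotHistory API already used in this diff (default/add/check/clone):

use solana_sdk::slot_history::{Check, SlotHistory};

// Clone the replayed history and pretend it already contains the future tower
// root, so sanitization can anchor on a slot known to both sides.
fn warp(slot_history: &SlotHistory, tower_root: u64) -> SlotHistory {
    let mut warped = slot_history.clone();
    warped.add(tower_root); // newer than every replayed slot, keeping the invariant
    warped
}

fn main() {
    let mut slot_history = SlotHistory::default(); // already contains slot 0
    slot_history.add(2); // the replayed root
    let warped = warp(&slot_history, 12); // a tower root "from the future"
    assert_eq!(warped.check(12), Check::Found);
    assert_eq!(warped.check(2), Check::Found);
}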
@@ -1152,8 +1217,11 @@ impl SavedTower {
     }
 }
 
-// Given an untimely crash, tower may have roots that are not reflected in blockstore because
-// `ReplayState::handle_votable_bank()` saves tower before setting blockstore roots
+// Given an untimely crash, the tower may have roots that are not reflected in the blockstore,
+// or vice versa.
+// That's because we don't impose any ordering guarantee or any kind of write barrier
+// between the tower (plain old POSIX fs calls) and the blockstore (via RocksDB) when
+// `ReplayState::handle_votable_bank()` saves the tower before setting blockstore roots.
 pub fn reconcile_blockstore_roots_with_tower(
     tower: &Tower,
     blockstore: &Blockstore,
@@ -1173,12 +1241,23 @@ pub fn reconcile_blockstore_roots_with_tower(
                 ),
             })
             .collect();
-        assert!(
-            !new_roots.is_empty(),
-            "at least 1 parent slot must be found"
-        );
-
-        blockstore.set_roots(&new_roots)?;
+        if !new_roots.is_empty() {
+            info!(
+                "Reconciling slots as root based on tower root: {:?} ({}..{}) ",
+                new_roots, tower_root, last_blockstore_root
+            );
+            blockstore.set_roots(&new_roots)?;
+        } else {
+            // This indicates we're in a bad state, but still don't panic here.
+            // That's because we might have a chance of recovering properly with a
+            // newer snapshot.
+            warn!(
+                "Couldn't find any ancestor slots from tower root ({}) \
+                 towards blockstore root ({}); blockstore pruned, or only the \
+                 tower moved into a new ledger?",
+                tower_root, last_blockstore_root,
+            );
+        }
     }
     Ok(())
 }
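
For context, new_roots is built just above this hunk by walking ancestors from the tower root back toward (but excluding) the last blockstore root; the new else branch handles the case where that walk yields nothing. A self-contained sketch of the same walk over a plain parent map (illustrative only; the real code iterates the Blockstore rather than a HashMap):

use std::collections::HashMap;

type Slot = u64;

// Walk child -> parent links from the tower root down toward the last blockstore
// root, collecting every slot strictly newer than that root. An empty result
// corresponds to the warn! branch above: the blockstore was pruned, or only the
// tower was moved into a new ledger.
fn roots_to_reconcile(
    parents: &HashMap<Slot, Slot>,
    tower_root: Slot,
    last_blockstore_root: Slot,
) -> Vec<Slot> {
    let mut new_roots = vec![];
    let mut current = Some(tower_root);
    while let Some(slot) = current {
        if slot <= last_blockstore_root {
            break; // reached a slot the blockstore already roots
        }
        match parents.get(&slot) {
            Some(&parent) => {
                new_roots.push(slot);
                current = Some(parent);
            }
            // A slot unknown to the map stands in for a slot missing from the
            // blockstore: the walk stops, possibly with nothing collected.
            None => break,
        }
    }
    new_roots
}

fn main() {
    // Chain 0 <- 1 <- 3 <- 4, with tower root 4 and blockstore root 0.
    let parents: HashMap<Slot, Slot> = [(1, 0), (3, 1), (4, 3)].into_iter().collect();
    assert_eq!(roots_to_reconcile(&parents, 4, 0), vec![4, 3, 1]);
    // A tower root unknown to this ledger: nothing to reconcile, warn instead of panic.
    assert_eq!(roots_to_reconcile(&parents, 42, 0), Vec::<Slot>::new());
}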
@@ -2700,8 +2779,7 @@ pub mod test {
     }
 
     #[test]
-    #[should_panic(expected = "at least 1 parent slot must be found")]
-    fn test_reconcile_blockstore_roots_with_tower_panic_no_parent() {
+    fn test_reconcile_blockstore_roots_with_tower_nop_no_parent() {
         solana_logger::setup();
         let blockstore_path = get_tmp_ledger_path!();
         {
@@ -2717,7 +2795,9 @@ pub mod test {
 
             let mut tower = Tower::new_with_key(&Pubkey::default());
             tower.lockouts.root_slot = Some(4);
+            assert_eq!(blockstore.last_root(), 0);
             reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
+            assert_eq!(blockstore.last_root(), 0);
         }
         Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction");
     }
@@ -3068,4 +3148,25 @@ pub mod test {
 
         assert!(tower.adjust_lockouts_after_replay(0, &slot_history).is_ok());
     }
+
+    #[test]
+    fn test_adjust_lockouts_after_replay_future_tower() {
+        let mut tower = Tower::new_for_tests(10, 0.9);
+        tower.lockouts.votes.push_back(Lockout::new(13));
+        tower.lockouts.votes.push_back(Lockout::new(14));
+        let vote = Vote::new(vec![14], Hash::default());
+        tower.last_vote = vote;
+        tower.initialize_root(12);
+
+        let mut slot_history = SlotHistory::default();
+        slot_history.add(0);
+        slot_history.add(2);
+
+        let tower = tower
+            .adjust_lockouts_after_replay(2, &slot_history)
+            .unwrap();
+        assert_eq!(tower.root(), 12);
+        assert_eq!(tower.voted_slots(), vec![13, 14]);
+        assert_eq!(tower.stray_restored_slot, Some(14));
+    }
 }
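
One more note on why this test lands in the future-tower branch rather than the TooOld path: the slot history only knows slots 0 and 2, so the replayed root (2) is older than the restored tower root (12). The checks below reproduce just the SlotHistory side of that setup (same solana_sdk API as above; the slot numbers are taken from the test):

use solana_sdk::slot_history::{Check, SlotHistory};

fn main() {
    // Same history as in test_adjust_lockouts_after_replay_future_tower.
    let mut slot_history = SlotHistory::default();
    slot_history.add(0);
    slot_history.add(2);

    // The replayed root (2) is known to the history...
    assert_eq!(slot_history.check(2), Check::Found);
    // ...while the restored tower root (12) and last vote (14) lie in its future,
    // which is exactly the condition the warped-history branch is written for.
    assert_eq!(slot_history.check(12), Check::Future);
    assert_eq!(slot_history.check(14), Check::Future);
}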