@@ -30,11 +30,11 @@ use std::{
3030} ;
3131use thiserror:: Error ;
3232
33- #[ derive( PartialEq , Clone , Debug ) ]
33+ #[ derive( PartialEq , Clone , Debug , AbiExample ) ]
3434pub enum SwitchForkDecision {
3535 SwitchProof ( Hash ) ,
36- NoSwitch ,
37- FailedSwitchThreshold ,
36+ SameFork ,
37+ FailedSwitchThreshold ( u64 , u64 ) ,
3838}
3939
4040impl SwitchForkDecision {
@@ -45,8 +45,11 @@ impl SwitchForkDecision {
4545 authorized_voter_pubkey : & Pubkey ,
4646 ) -> Option < Instruction > {
4747 match self {
48- SwitchForkDecision :: FailedSwitchThreshold => None ,
49- SwitchForkDecision :: NoSwitch => Some ( vote_instruction:: vote (
48+ SwitchForkDecision :: FailedSwitchThreshold ( _, total_stake) => {
49+ assert_ne ! ( * total_stake, 0 ) ;
50+ None
51+ }
52+ SwitchForkDecision :: SameFork => Some ( vote_instruction:: vote (
5053 vote_account_pubkey,
5154 authorized_voter_pubkey,
5255 vote,
@@ -61,6 +64,10 @@ impl SwitchForkDecision {
6164 }
6265 }
6366 }
67+
68+ pub fn can_vote ( & self ) -> bool {
69+ !matches ! ( self , SwitchForkDecision :: FailedSwitchThreshold ( _, _) )
70+ }
6471}
6572
6673pub const VOTE_THRESHOLD_DEPTH : usize = 8 ;
@@ -101,6 +108,8 @@ pub struct Tower {
101108 // This could be emptied after some time; but left intact indefinitely for easier
102109 // implementation
103110 stray_restored_slot : Option < Slot > ,
111+ #[ serde( skip) ]
112+ pub last_switch_threshold_check : Option < ( Slot , SwitchForkDecision ) > ,
104113}
105114
106115impl Default for Tower {
@@ -115,6 +124,7 @@ impl Default for Tower {
115124 path : PathBuf :: default ( ) ,
116125 tmp_path : PathBuf :: default ( ) ,
117126 stray_restored_slot : Option :: default ( ) ,
127+ last_switch_threshold_check : Option :: default ( ) ,
118128 } ;
119129 // VoteState::root_slot is ensured to be Some in Tower
120130 tower. lockouts . root_slot = Some ( Slot :: default ( ) ) ;
@@ -493,7 +503,7 @@ impl Tower {
493503 false
494504 }
495505
496- pub ( crate ) fn check_switch_threshold (
506+ fn make_check_switch_threshold_decision (
497507 & self ,
498508 switch_slot : u64 ,
499509 ancestors : & HashMap < Slot , HashSet < u64 > > ,
@@ -520,9 +530,34 @@ impl Tower {
520530 // all of them.
521531 panic ! ( "no ancestors found with slot: {}" , last_voted_slot) ;
522532 } else {
523- // bank_forks doesn't have corresponding data for the stray restored last vote,
524- // meaning some inconsistency between saved tower and ledger.
525- // (newer snapshot, or only a saved tower is moved over to new setup?)
533+ // This condition shouldn't occur under normal validator operation, indicating
534+ // something unusual happened.
535+ // Possible causes include: OS/HW crash, validator process crash, only saved tower
536+ // is moved over to a new setup, etc...
537+
538+ // However, returning empty ancestors as a fallback here shouldn't result in
539+ // slashing by itself (Note that we couldn't fully preclude any kind of slashing if
540+ // the failure was OS or HW level).
541+
542+ // Firstly, lockout is ensured elsewhere.
543+
544+ // Also, there is no risk of optimistic conf. violation. Although empty ancestors
545+ // could result in incorrect (= more than actual) locked_out_stake and
546+ // false-positive SwitchProof later in this function, there should be no such a
547+ // heavier fork candidate, first of all, if the last vote (or any of its
548+ // unavailable ancestors) were already optimistically confirmed.
549+ // The only exception is that other validator is already violating it...
550+ if self . is_first_switch_check ( ) && switch_slot < last_voted_slot {
551+ // `switch < last` is needed not to warn! this message just because of using
552+ // newer snapshots on validator restart
553+ let message = format ! (
554+ "bank_forks doesn't have corresponding data for the stray restored \
555+ last vote({}), meaning some inconsistency between saved tower and ledger.",
556+ last_voted_slot
557+ ) ;
558+ warn ! ( "{}" , message) ;
559+ datapoint_warn ! ( "tower_warn" , ( "warn" , message, String ) ) ;
560+ }
526561 & empty_ancestors
527562 }
528563 } ) ;
@@ -532,7 +567,7 @@ impl Tower {
532567 if switch_slot == last_voted_slot || switch_slot_ancestors. contains ( & last_voted_slot) {
533568 // If the `switch_slot is a descendant of the last vote,
534569 // no switching proof is necessary
535- return SwitchForkDecision :: NoSwitch ;
570+ return SwitchForkDecision :: SameFork ;
536571 }
537572
538573 // Should never consider switching to an ancestor
@@ -598,7 +633,7 @@ impl Tower {
598633 }
599634
600635 // Only count lockouts on slots that are:
601- // 1) Not ancestors of `last_vote`
636+ // 1) Not ancestors of `last_vote`, meaning being on different fork
602637 // 2) Not from before the current root as we can't determine if
603638 // anything before the root was an ancestor of `last_vote` or not
604639 if !last_vote_ancestors. contains ( lockout_interval_start)
@@ -622,10 +657,43 @@ impl Tower {
622657 if ( locked_out_stake as f64 / total_stake as f64 ) > SWITCH_FORK_THRESHOLD {
623658 SwitchForkDecision :: SwitchProof ( switch_proof)
624659 } else {
625- SwitchForkDecision :: FailedSwitchThreshold
660+ SwitchForkDecision :: FailedSwitchThreshold ( locked_out_stake , total_stake )
626661 }
627662 } )
628- . unwrap_or ( SwitchForkDecision :: NoSwitch )
663+ . unwrap_or ( SwitchForkDecision :: SameFork )
664+ }
665+
666+ pub ( crate ) fn check_switch_threshold (
667+ & mut self ,
668+ switch_slot : u64 ,
669+ ancestors : & HashMap < Slot , HashSet < u64 > > ,
670+ descendants : & HashMap < Slot , HashSet < u64 > > ,
671+ progress : & ProgressMap ,
672+ total_stake : u64 ,
673+ epoch_vote_accounts : & HashMap < Pubkey , ( u64 , Account ) > ,
674+ ) -> SwitchForkDecision {
675+ let decision = self . make_check_switch_threshold_decision (
676+ switch_slot,
677+ ancestors,
678+ descendants,
679+ progress,
680+ total_stake,
681+ epoch_vote_accounts,
682+ ) ;
683+ let new_check = Some ( ( switch_slot, decision. clone ( ) ) ) ;
684+ if new_check != self . last_switch_threshold_check {
685+ trace ! (
686+ "new switch threshold check: slot {}: {:?}" ,
687+ switch_slot,
688+ decision,
689+ ) ;
690+ self . last_switch_threshold_check = new_check;
691+ }
692+ decision
693+ }
694+
695+ fn is_first_switch_check ( & self ) -> bool {
696+ self . last_switch_threshold_check . is_none ( )
629697 }
630698
631699 pub fn check_vote_stake_threshold (
@@ -932,9 +1000,9 @@ impl Tower {
9321000 self . lockouts = vote_state;
9331001 self . do_initialize_lockouts ( root, |v| v. slot > root) ;
9341002 trace ! (
935- "{} lockouts initialized to {:? }" ,
1003+ "Lockouts in tower for {} is initialized using bank { }" ,
9361004 self . node_pubkey,
937- self . lockouts
1005+ bank . slot ( ) ,
9381006 ) ;
9391007 assert_eq ! (
9401008 self . lockouts. node_pubkey, self . node_pubkey,
@@ -986,6 +1054,7 @@ impl Tower {
9861054 bincode:: serialize_into ( & mut file, & saved_tower) ?;
9871055 // file.sync_all() hurts performance; pipeline sync-ing and submitting votes to the cluster!
9881056 }
1057+ trace ! ( "persisted votes: {:?}" , self . voted_slots( ) ) ;
9891058 fs:: rename ( & new_filename, & filename) ?;
9901059 // self.path.parent().sync_all() hurts performance same as the above sync
9911060
@@ -1047,6 +1116,16 @@ pub enum TowerError {
10471116 FatallyInconsistent ( & ' static str ) ,
10481117}
10491118
1119+ impl TowerError {
1120+ pub fn is_file_missing ( & self ) -> bool {
1121+ if let TowerError :: IOError ( io_err) = & self {
1122+ io_err. kind ( ) == std:: io:: ErrorKind :: NotFound
1123+ } else {
1124+ false
1125+ }
1126+ }
1127+ }
1128+
10501129#[ frozen_abi( digest = "Gaxfwvx5MArn52mKZQgzHmDCyn5YfCuTHvp5Et3rFfpp" ) ]
10511130#[ derive( Default , Clone , Serialize , Deserialize , Debug , PartialEq , AbiExample ) ]
10521131pub struct SavedTower {
@@ -1267,7 +1346,7 @@ pub mod test {
12671346 & ancestors,
12681347 & descendants,
12691348 & self . progress ,
1270- & tower,
1349+ tower,
12711350 ) ;
12721351
12731352 // Make sure this slot isn't locked out or failing threshold
@@ -1464,11 +1543,11 @@ pub mod test {
14641543 #[ test]
14651544 fn test_to_vote_instruction ( ) {
14661545 let vote = Vote :: default ( ) ;
1467- let mut decision = SwitchForkDecision :: FailedSwitchThreshold ;
1546+ let mut decision = SwitchForkDecision :: FailedSwitchThreshold ( 0 , 1 ) ;
14681547 assert ! ( decision
14691548 . to_vote_instruction( vote. clone( ) , & Pubkey :: default ( ) , & Pubkey :: default ( ) )
14701549 . is_none( ) ) ;
1471- decision = SwitchForkDecision :: NoSwitch ;
1550+ decision = SwitchForkDecision :: SameFork ;
14721551 assert_eq ! (
14731552 decision. to_vote_instruction( vote. clone( ) , & Pubkey :: default ( ) , & Pubkey :: default ( ) ) ,
14741553 Some ( vote_instruction:: vote(
@@ -1571,7 +1650,7 @@ pub mod test {
15711650 total_stake,
15721651 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
15731652 ) ,
1574- SwitchForkDecision :: NoSwitch
1653+ SwitchForkDecision :: SameFork
15751654 ) ;
15761655
15771656 // Trying to switch to another fork at 110 should fail
@@ -1584,7 +1663,7 @@ pub mod test {
15841663 total_stake,
15851664 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
15861665 ) ,
1587- SwitchForkDecision :: FailedSwitchThreshold
1666+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
15881667 ) ;
15891668
15901669 // Adding another validator lockout on a descendant of last vote should
@@ -1599,7 +1678,7 @@ pub mod test {
15991678 total_stake,
16001679 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
16011680 ) ,
1602- SwitchForkDecision :: FailedSwitchThreshold
1681+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
16031682 ) ;
16041683
16051684 // Adding another validator lockout on an ancestor of last vote should
@@ -1614,7 +1693,7 @@ pub mod test {
16141693 total_stake,
16151694 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
16161695 ) ,
1617- SwitchForkDecision :: FailedSwitchThreshold
1696+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
16181697 ) ;
16191698
16201699 // Adding another validator lockout on a different fork, but the lockout
@@ -1629,7 +1708,7 @@ pub mod test {
16291708 total_stake,
16301709 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
16311710 ) ,
1632- SwitchForkDecision :: FailedSwitchThreshold
1711+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
16331712 ) ;
16341713
16351714 // Adding another validator lockout on a different fork, and the lockout
@@ -1646,7 +1725,7 @@ pub mod test {
16461725 total_stake,
16471726 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
16481727 ) ,
1649- SwitchForkDecision :: FailedSwitchThreshold
1728+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
16501729 ) ;
16511730
16521731 // Adding another validator lockout on a different fork, and the lockout
@@ -1697,7 +1776,7 @@ pub mod test {
16971776 total_stake,
16981777 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
16991778 ) ,
1700- SwitchForkDecision :: FailedSwitchThreshold
1779+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
17011780 ) ;
17021781 }
17031782
@@ -2365,7 +2444,7 @@ pub mod test {
23652444 total_stake,
23662445 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
23672446 ) ,
2368- SwitchForkDecision :: NoSwitch
2447+ SwitchForkDecision :: SameFork
23692448 ) ;
23702449
23712450 // Trying to switch to another fork at 110 should fail
@@ -2378,7 +2457,7 @@ pub mod test {
23782457 total_stake,
23792458 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
23802459 ) ,
2381- SwitchForkDecision :: FailedSwitchThreshold
2460+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
23822461 ) ;
23832462
23842463 vote_simulator. simulate_lockout_interval ( 111 , ( 10 , 49 ) , & other_vote_account) ;
@@ -2456,7 +2535,7 @@ pub mod test {
24562535 total_stake,
24572536 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
24582537 ) ,
2459- SwitchForkDecision :: FailedSwitchThreshold
2538+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
24602539 ) ;
24612540
24622541 // Add lockout_interval which should be excluded
@@ -2470,7 +2549,7 @@ pub mod test {
24702549 total_stake,
24712550 bank0. epoch_vote_accounts( 0 ) . unwrap( ) ,
24722551 ) ,
2473- SwitchForkDecision :: FailedSwitchThreshold
2552+ SwitchForkDecision :: FailedSwitchThreshold ( 0 , 20000 )
24742553 ) ;
24752554
24762555 // Add lockout_interval which should not be excluded
0 commit comments