-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Better tower logs for SwitchForkDecision and etc #12875
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,11 +30,11 @@ use std::{ | |
| }; | ||
| use thiserror::Error; | ||
|
|
||
| #[derive(PartialEq, Clone, Debug)] | ||
| #[derive(PartialEq, Clone, Debug, AbiExample)] | ||
| pub enum SwitchForkDecision { | ||
| SwitchProof(Hash), | ||
| NoSwitch, | ||
| FailedSwitchThreshold, | ||
| SameFork, | ||
| FailedSwitchThreshold(u64, u64), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's start passing around locked_out_stake and total_stake for logging. I found this very useful to debug. |
||
| } | ||
|
|
||
| impl SwitchForkDecision { | ||
|
|
@@ -45,8 +45,11 @@ impl SwitchForkDecision { | |
| authorized_voter_pubkey: &Pubkey, | ||
| ) -> Option<Instruction> { | ||
| match self { | ||
| SwitchForkDecision::FailedSwitchThreshold => None, | ||
| SwitchForkDecision::NoSwitch => Some(vote_instruction::vote( | ||
| SwitchForkDecision::FailedSwitchThreshold(_, total_stake) => { | ||
| assert_ne!(*total_stake, 0); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. place some sanity check as a bonus. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also note that, this assumes there is no way to control |
||
| None | ||
| } | ||
| SwitchForkDecision::SameFork => Some(vote_instruction::vote( | ||
| vote_account_pubkey, | ||
| authorized_voter_pubkey, | ||
| vote, | ||
|
|
@@ -61,6 +64,10 @@ impl SwitchForkDecision { | |
| } | ||
| } | ||
| } | ||
|
|
||
| pub fn can_vote(&self) -> bool { | ||
| !matches!(self, SwitchForkDecision::FailedSwitchThreshold(_, _)) | ||
| } | ||
| } | ||
|
|
||
| pub const VOTE_THRESHOLD_DEPTH: usize = 8; | ||
|
|
@@ -101,6 +108,8 @@ pub struct Tower { | |
| // This could be emptied after some time; but left intact indefinitely for easier | ||
| // implementation | ||
| stray_restored_slot: Option<Slot>, | ||
| #[serde(skip)] | ||
| pub last_switch_threshold_check: Option<(Slot, SwitchForkDecision)>, | ||
| } | ||
|
|
||
| impl Default for Tower { | ||
|
|
@@ -115,6 +124,7 @@ impl Default for Tower { | |
| path: PathBuf::default(), | ||
| tmp_path: PathBuf::default(), | ||
| stray_restored_slot: Option::default(), | ||
| last_switch_threshold_check: Option::default(), | ||
| }; | ||
| // VoteState::root_slot is ensured to be Some in Tower | ||
| tower.lockouts.root_slot = Some(Slot::default()); | ||
|
|
@@ -493,7 +503,7 @@ impl Tower { | |
| false | ||
| } | ||
|
|
||
| pub(crate) fn check_switch_threshold( | ||
| fn make_check_switch_threshold_decision( | ||
| &self, | ||
| switch_slot: u64, | ||
| ancestors: &HashMap<Slot, HashSet<u64>>, | ||
|
|
@@ -520,9 +530,34 @@ impl Tower { | |
| // all of them. | ||
| panic!("no ancestors found with slot: {}", last_voted_slot); | ||
| } else { | ||
| // bank_forks doesn't have corresponding data for the stray restored last vote, | ||
| // meaning some inconsistency between saved tower and ledger. | ||
| // (newer snapshot, or only a saved tower is moved over to new setup?) | ||
| // This condition shouldn't occur under normal validator operation, indicating | ||
| // something unusual happened. | ||
|
Comment on lines
+533
to
+534
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the record, this is the well-written proof of this:
And this proof is correct as far as I peer-reviewed this. |
||
| // Possible causes include: OS/HW crash, validator process crash, only saved tower | ||
| // is moved over to a new setup, etc... | ||
|
|
||
| // However, returning empty ancestors as a fallback here shouldn't result in | ||
| // slashing by itself (Note that we couldn't fully preclude any kind of slashing if | ||
| // the failure was OS or HW level). | ||
|
|
||
| // Firstly, lockout is ensured elsewhere. | ||
|
|
||
| // Also, there is no risk of optimistic conf. violation. Although empty ancestors | ||
| // could result in incorrect (= more than actual) locked_out_stake and | ||
| // false-positive SwitchProof later in this function, there should be no such a | ||
| // heavier fork candidate, first of all, if the last vote (or any of its | ||
| // unavailable ancestors) were already optimistically confirmed. | ||
| // The only exception is that other validator is already violating it... | ||
| if self.is_first_switch_check() && switch_slot < last_voted_slot { | ||
ryoqun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // `switch < last` is needed not to warn! this message just because of using | ||
| // newer snapshots on validator restart | ||
| let message = format!( | ||
| "bank_forks doesn't have corresponding data for the stray restored \ | ||
| last vote({}), meaning some inconsistency between saved tower and ledger.", | ||
| last_voted_slot | ||
| ); | ||
| warn!("{}", message); | ||
| datapoint_warn!("tower_warn", ("warn", message, String)); | ||
| } | ||
| &empty_ancestors | ||
| } | ||
| }); | ||
|
|
@@ -532,7 +567,7 @@ impl Tower { | |
| if switch_slot == last_voted_slot || switch_slot_ancestors.contains(&last_voted_slot) { | ||
| // If the `switch_slot is a descendant of the last vote, | ||
| // no switching proof is necessary | ||
| return SwitchForkDecision::NoSwitch; | ||
| return SwitchForkDecision::SameFork; | ||
| } | ||
|
|
||
| // Should never consider switching to an ancestor | ||
|
|
@@ -598,7 +633,7 @@ impl Tower { | |
| } | ||
|
|
||
| // Only count lockouts on slots that are: | ||
| // 1) Not ancestors of `last_vote` | ||
| // 1) Not ancestors of `last_vote`, meaning being on different fork | ||
| // 2) Not from before the current root as we can't determine if | ||
| // anything before the root was an ancestor of `last_vote` or not | ||
| if !last_vote_ancestors.contains(lockout_interval_start) | ||
|
|
@@ -622,10 +657,43 @@ impl Tower { | |
| if (locked_out_stake as f64 / total_stake as f64) > SWITCH_FORK_THRESHOLD { | ||
| SwitchForkDecision::SwitchProof(switch_proof) | ||
| } else { | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(locked_out_stake, total_stake) | ||
| } | ||
| }) | ||
| .unwrap_or(SwitchForkDecision::NoSwitch) | ||
| .unwrap_or(SwitchForkDecision::SameFork) | ||
| } | ||
|
|
||
| pub(crate) fn check_switch_threshold( | ||
| &mut self, | ||
| switch_slot: u64, | ||
| ancestors: &HashMap<Slot, HashSet<u64>>, | ||
| descendants: &HashMap<Slot, HashSet<u64>>, | ||
| progress: &ProgressMap, | ||
| total_stake: u64, | ||
| epoch_vote_accounts: &HashMap<Pubkey, (u64, Account)>, | ||
| ) -> SwitchForkDecision { | ||
| let decision = self.make_check_switch_threshold_decision( | ||
| switch_slot, | ||
| ancestors, | ||
| descendants, | ||
| progress, | ||
| total_stake, | ||
| epoch_vote_accounts, | ||
| ); | ||
| let new_check = Some((switch_slot, decision.clone())); | ||
| if new_check != self.last_switch_threshold_check { | ||
| trace!( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just wanted to |
||
| "new switch threshold check: slot {}: {:?}", | ||
| switch_slot, | ||
| decision, | ||
| ); | ||
| self.last_switch_threshold_check = new_check; | ||
| } | ||
| decision | ||
| } | ||
|
|
||
| fn is_first_switch_check(&self) -> bool { | ||
| self.last_switch_threshold_check.is_none() | ||
| } | ||
|
|
||
| pub fn check_vote_stake_threshold( | ||
|
|
@@ -932,9 +1000,9 @@ impl Tower { | |
| self.lockouts = vote_state; | ||
| self.do_initialize_lockouts(root, |v| v.slot > root); | ||
| trace!( | ||
| "{} lockouts initialized to {:?}", | ||
| "Lockouts in tower for {} is initialized using bank {}", | ||
| self.node_pubkey, | ||
| self.lockouts | ||
ryoqun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| bank.slot(), | ||
| ); | ||
| assert_eq!( | ||
| self.lockouts.node_pubkey, self.node_pubkey, | ||
|
|
@@ -986,6 +1054,7 @@ impl Tower { | |
| bincode::serialize_into(&mut file, &saved_tower)?; | ||
| // file.sync_all() hurts performance; pipeline sync-ing and submitting votes to the cluster! | ||
| } | ||
| trace!("persisted votes: {:?}", self.voted_slots()); | ||
| fs::rename(&new_filename, &filename)?; | ||
| // self.path.parent().sync_all() hurts performance same as the above sync | ||
|
|
||
|
|
@@ -1047,6 +1116,16 @@ pub enum TowerError { | |
| FatallyInconsistent(&'static str), | ||
| } | ||
|
|
||
| impl TowerError { | ||
| pub fn is_file_missing(&self) -> bool { | ||
| if let TowerError::IOError(io_err) = &self { | ||
| io_err.kind() == std::io::ErrorKind::NotFound | ||
| } else { | ||
| false | ||
| } | ||
| } | ||
| } | ||
|
|
||
| #[frozen_abi(digest = "Gaxfwvx5MArn52mKZQgzHmDCyn5YfCuTHvp5Et3rFfpp")] | ||
| #[derive(Default, Clone, Serialize, Deserialize, Debug, PartialEq, AbiExample)] | ||
| pub struct SavedTower { | ||
|
|
@@ -1267,7 +1346,7 @@ pub mod test { | |
| &ancestors, | ||
| &descendants, | ||
| &self.progress, | ||
| &tower, | ||
| tower, | ||
| ); | ||
|
|
||
| // Make sure this slot isn't locked out or failing threshold | ||
|
|
@@ -1464,11 +1543,11 @@ pub mod test { | |
| #[test] | ||
| fn test_to_vote_instruction() { | ||
| let vote = Vote::default(); | ||
| let mut decision = SwitchForkDecision::FailedSwitchThreshold; | ||
| let mut decision = SwitchForkDecision::FailedSwitchThreshold(0, 1); | ||
| assert!(decision | ||
| .to_vote_instruction(vote.clone(), &Pubkey::default(), &Pubkey::default()) | ||
| .is_none()); | ||
| decision = SwitchForkDecision::NoSwitch; | ||
| decision = SwitchForkDecision::SameFork; | ||
| assert_eq!( | ||
| decision.to_vote_instruction(vote.clone(), &Pubkey::default(), &Pubkey::default()), | ||
| Some(vote_instruction::vote( | ||
|
|
@@ -1571,7 +1650,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::NoSwitch | ||
| SwitchForkDecision::SameFork | ||
| ); | ||
|
|
||
| // Trying to switch to another fork at 110 should fail | ||
|
|
@@ -1584,7 +1663,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
|
|
||
| // Adding another validator lockout on a descendant of last vote should | ||
|
|
@@ -1599,7 +1678,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
|
|
||
| // Adding another validator lockout on an ancestor of last vote should | ||
|
|
@@ -1614,7 +1693,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
|
|
||
| // Adding another validator lockout on a different fork, but the lockout | ||
|
|
@@ -1629,7 +1708,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
|
|
||
| // Adding another validator lockout on a different fork, and the lockout | ||
|
|
@@ -1646,7 +1725,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
|
|
||
| // Adding another validator lockout on a different fork, and the lockout | ||
|
|
@@ -1697,7 +1776,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -2365,7 +2444,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::NoSwitch | ||
| SwitchForkDecision::SameFork | ||
| ); | ||
|
|
||
| // Trying to switch to another fork at 110 should fail | ||
|
|
@@ -2378,7 +2457,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
|
|
||
| vote_simulator.simulate_lockout_interval(111, (10, 49), &other_vote_account); | ||
|
|
@@ -2456,7 +2535,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
|
|
||
| // Add lockout_interval which should be excluded | ||
|
|
@@ -2470,7 +2549,7 @@ pub mod test { | |
| total_stake, | ||
| bank0.epoch_vote_accounts(0).unwrap(), | ||
| ), | ||
| SwitchForkDecision::FailedSwitchThreshold | ||
| SwitchForkDecision::FailedSwitchThreshold(0, 20000) | ||
| ); | ||
|
|
||
| // Add lockout_interval which should not be excluded | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.