Skip to content

Commit 39ae4e0

Browse files
ryoqunTyera Eulberg
authored andcommitted
Better tower logs for SwitchForkDecision and etc (solana-labs#12875)
* Better tower logs for SwitchForkDecision and etc * nits * Update comment
1 parent 7f87514 commit 39ae4e0

File tree

3 files changed

+115
-41
lines changed

3 files changed

+115
-41
lines changed

core/src/consensus.rs

Lines changed: 108 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@ use std::{
3030
};
3131
use thiserror::Error;
3232

33-
#[derive(PartialEq, Clone, Debug)]
33+
#[derive(PartialEq, Clone, Debug, AbiExample)]
3434
pub enum SwitchForkDecision {
3535
SwitchProof(Hash),
36-
NoSwitch,
37-
FailedSwitchThreshold,
36+
SameFork,
37+
FailedSwitchThreshold(u64, u64),
3838
}
3939

4040
impl SwitchForkDecision {
@@ -45,8 +45,11 @@ impl SwitchForkDecision {
4545
authorized_voter_pubkey: &Pubkey,
4646
) -> Option<Instruction> {
4747
match self {
48-
SwitchForkDecision::FailedSwitchThreshold => None,
49-
SwitchForkDecision::NoSwitch => Some(vote_instruction::vote(
48+
SwitchForkDecision::FailedSwitchThreshold(_, total_stake) => {
49+
assert_ne!(*total_stake, 0);
50+
None
51+
}
52+
SwitchForkDecision::SameFork => Some(vote_instruction::vote(
5053
vote_account_pubkey,
5154
authorized_voter_pubkey,
5255
vote,
@@ -61,6 +64,10 @@ impl SwitchForkDecision {
6164
}
6265
}
6366
}
67+
68+
pub fn can_vote(&self) -> bool {
69+
!matches!(self, SwitchForkDecision::FailedSwitchThreshold(_, _))
70+
}
6471
}
6572

6673
pub const VOTE_THRESHOLD_DEPTH: usize = 8;
@@ -101,6 +108,8 @@ pub struct Tower {
101108
// This could be emptied after some time; but left intact indefinitely for easier
102109
// implementation
103110
stray_restored_slot: Option<Slot>,
111+
#[serde(skip)]
112+
pub last_switch_threshold_check: Option<(Slot, SwitchForkDecision)>,
104113
}
105114

106115
impl Default for Tower {
@@ -115,6 +124,7 @@ impl Default for Tower {
115124
path: PathBuf::default(),
116125
tmp_path: PathBuf::default(),
117126
stray_restored_slot: Option::default(),
127+
last_switch_threshold_check: Option::default(),
118128
};
119129
// VoteState::root_slot is ensured to be Some in Tower
120130
tower.lockouts.root_slot = Some(Slot::default());
@@ -493,7 +503,7 @@ impl Tower {
493503
false
494504
}
495505

496-
pub(crate) fn check_switch_threshold(
506+
fn make_check_switch_threshold_decision(
497507
&self,
498508
switch_slot: u64,
499509
ancestors: &HashMap<Slot, HashSet<u64>>,
@@ -520,9 +530,34 @@ impl Tower {
520530
// all of them.
521531
panic!("no ancestors found with slot: {}", last_voted_slot);
522532
} else {
523-
// bank_forks doesn't have corresponding data for the stray restored last vote,
524-
// meaning some inconsistency between saved tower and ledger.
525-
// (newer snapshot, or only a saved tower is moved over to new setup?)
533+
// This condition shouldn't occur under normal validator operation, indicating
534+
// something unusual happened.
535+
// Possible causes include: OS/HW crash, validator process crash, only saved tower
536+
// is moved over to a new setup, etc...
537+
538+
// However, returning empty ancestors as a fallback here shouldn't result in
539+
// slashing by itself (Note that we couldn't fully preclude any kind of slashing if
540+
// the failure was OS or HW level).
541+
542+
// Firstly, lockout is ensured elsewhere.
543+
544+
// Also, there is no risk of optimistic conf. violation. Although empty ancestors
545+
// could result in incorrect (= more than actual) locked_out_stake and
546+
// false-positive SwitchProof later in this function, there should be no such a
547+
// heavier fork candidate, first of all, if the last vote (or any of its
548+
// unavailable ancestors) were already optimistically confirmed.
549+
// The only exception is that other validator is already violating it...
550+
if self.is_first_switch_check() && switch_slot < last_voted_slot {
551+
// `switch < last` is needed not to warn! this message just because of using
552+
// newer snapshots on validator restart
553+
let message = format!(
554+
"bank_forks doesn't have corresponding data for the stray restored \
555+
last vote({}), meaning some inconsistency between saved tower and ledger.",
556+
last_voted_slot
557+
);
558+
warn!("{}", message);
559+
datapoint_warn!("tower_warn", ("warn", message, String));
560+
}
526561
&empty_ancestors
527562
}
528563
});
@@ -532,7 +567,7 @@ impl Tower {
532567
if switch_slot == last_voted_slot || switch_slot_ancestors.contains(&last_voted_slot) {
533568
// If the `switch_slot is a descendant of the last vote,
534569
// no switching proof is necessary
535-
return SwitchForkDecision::NoSwitch;
570+
return SwitchForkDecision::SameFork;
536571
}
537572

538573
// Should never consider switching to an ancestor
@@ -598,7 +633,7 @@ impl Tower {
598633
}
599634

600635
// Only count lockouts on slots that are:
601-
// 1) Not ancestors of `last_vote`
636+
// 1) Not ancestors of `last_vote`, meaning being on different fork
602637
// 2) Not from before the current root as we can't determine if
603638
// anything before the root was an ancestor of `last_vote` or not
604639
if !last_vote_ancestors.contains(lockout_interval_start)
@@ -622,10 +657,43 @@ impl Tower {
622657
if (locked_out_stake as f64 / total_stake as f64) > SWITCH_FORK_THRESHOLD {
623658
SwitchForkDecision::SwitchProof(switch_proof)
624659
} else {
625-
SwitchForkDecision::FailedSwitchThreshold
660+
SwitchForkDecision::FailedSwitchThreshold(locked_out_stake, total_stake)
626661
}
627662
})
628-
.unwrap_or(SwitchForkDecision::NoSwitch)
663+
.unwrap_or(SwitchForkDecision::SameFork)
664+
}
665+
666+
pub(crate) fn check_switch_threshold(
667+
&mut self,
668+
switch_slot: u64,
669+
ancestors: &HashMap<Slot, HashSet<u64>>,
670+
descendants: &HashMap<Slot, HashSet<u64>>,
671+
progress: &ProgressMap,
672+
total_stake: u64,
673+
epoch_vote_accounts: &HashMap<Pubkey, (u64, Account)>,
674+
) -> SwitchForkDecision {
675+
let decision = self.make_check_switch_threshold_decision(
676+
switch_slot,
677+
ancestors,
678+
descendants,
679+
progress,
680+
total_stake,
681+
epoch_vote_accounts,
682+
);
683+
let new_check = Some((switch_slot, decision.clone()));
684+
if new_check != self.last_switch_threshold_check {
685+
trace!(
686+
"new switch threshold check: slot {}: {:?}",
687+
switch_slot,
688+
decision,
689+
);
690+
self.last_switch_threshold_check = new_check;
691+
}
692+
decision
693+
}
694+
695+
fn is_first_switch_check(&self) -> bool {
696+
self.last_switch_threshold_check.is_none()
629697
}
630698

631699
pub fn check_vote_stake_threshold(
@@ -932,9 +1000,9 @@ impl Tower {
9321000
self.lockouts = vote_state;
9331001
self.do_initialize_lockouts(root, |v| v.slot > root);
9341002
trace!(
935-
"{} lockouts initialized to {:?}",
1003+
"Lockouts in tower for {} is initialized using bank {}",
9361004
self.node_pubkey,
937-
self.lockouts
1005+
bank.slot(),
9381006
);
9391007
assert_eq!(
9401008
self.lockouts.node_pubkey, self.node_pubkey,
@@ -986,6 +1054,7 @@ impl Tower {
9861054
bincode::serialize_into(&mut file, &saved_tower)?;
9871055
// file.sync_all() hurts performance; pipeline sync-ing and submitting votes to the cluster!
9881056
}
1057+
trace!("persisted votes: {:?}", self.voted_slots());
9891058
fs::rename(&new_filename, &filename)?;
9901059
// self.path.parent().sync_all() hurts performance same as the above sync
9911060

@@ -1047,6 +1116,16 @@ pub enum TowerError {
10471116
FatallyInconsistent(&'static str),
10481117
}
10491118

1119+
impl TowerError {
1120+
pub fn is_file_missing(&self) -> bool {
1121+
if let TowerError::IOError(io_err) = &self {
1122+
io_err.kind() == std::io::ErrorKind::NotFound
1123+
} else {
1124+
false
1125+
}
1126+
}
1127+
}
1128+
10501129
#[frozen_abi(digest = "Gaxfwvx5MArn52mKZQgzHmDCyn5YfCuTHvp5Et3rFfpp")]
10511130
#[derive(Default, Clone, Serialize, Deserialize, Debug, PartialEq, AbiExample)]
10521131
pub struct SavedTower {
@@ -1267,7 +1346,7 @@ pub mod test {
12671346
&ancestors,
12681347
&descendants,
12691348
&self.progress,
1270-
&tower,
1349+
tower,
12711350
);
12721351

12731352
// Make sure this slot isn't locked out or failing threshold
@@ -1464,11 +1543,11 @@ pub mod test {
14641543
#[test]
14651544
fn test_to_vote_instruction() {
14661545
let vote = Vote::default();
1467-
let mut decision = SwitchForkDecision::FailedSwitchThreshold;
1546+
let mut decision = SwitchForkDecision::FailedSwitchThreshold(0, 1);
14681547
assert!(decision
14691548
.to_vote_instruction(vote.clone(), &Pubkey::default(), &Pubkey::default())
14701549
.is_none());
1471-
decision = SwitchForkDecision::NoSwitch;
1550+
decision = SwitchForkDecision::SameFork;
14721551
assert_eq!(
14731552
decision.to_vote_instruction(vote.clone(), &Pubkey::default(), &Pubkey::default()),
14741553
Some(vote_instruction::vote(
@@ -1571,7 +1650,7 @@ pub mod test {
15711650
total_stake,
15721651
bank0.epoch_vote_accounts(0).unwrap(),
15731652
),
1574-
SwitchForkDecision::NoSwitch
1653+
SwitchForkDecision::SameFork
15751654
);
15761655

15771656
// Trying to switch to another fork at 110 should fail
@@ -1584,7 +1663,7 @@ pub mod test {
15841663
total_stake,
15851664
bank0.epoch_vote_accounts(0).unwrap(),
15861665
),
1587-
SwitchForkDecision::FailedSwitchThreshold
1666+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
15881667
);
15891668

15901669
// Adding another validator lockout on a descendant of last vote should
@@ -1599,7 +1678,7 @@ pub mod test {
15991678
total_stake,
16001679
bank0.epoch_vote_accounts(0).unwrap(),
16011680
),
1602-
SwitchForkDecision::FailedSwitchThreshold
1681+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
16031682
);
16041683

16051684
// Adding another validator lockout on an ancestor of last vote should
@@ -1614,7 +1693,7 @@ pub mod test {
16141693
total_stake,
16151694
bank0.epoch_vote_accounts(0).unwrap(),
16161695
),
1617-
SwitchForkDecision::FailedSwitchThreshold
1696+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
16181697
);
16191698

16201699
// Adding another validator lockout on a different fork, but the lockout
@@ -1629,7 +1708,7 @@ pub mod test {
16291708
total_stake,
16301709
bank0.epoch_vote_accounts(0).unwrap(),
16311710
),
1632-
SwitchForkDecision::FailedSwitchThreshold
1711+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
16331712
);
16341713

16351714
// Adding another validator lockout on a different fork, and the lockout
@@ -1646,7 +1725,7 @@ pub mod test {
16461725
total_stake,
16471726
bank0.epoch_vote_accounts(0).unwrap(),
16481727
),
1649-
SwitchForkDecision::FailedSwitchThreshold
1728+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
16501729
);
16511730

16521731
// Adding another validator lockout on a different fork, and the lockout
@@ -1697,7 +1776,7 @@ pub mod test {
16971776
total_stake,
16981777
bank0.epoch_vote_accounts(0).unwrap(),
16991778
),
1700-
SwitchForkDecision::FailedSwitchThreshold
1779+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
17011780
);
17021781
}
17031782

@@ -2365,7 +2444,7 @@ pub mod test {
23652444
total_stake,
23662445
bank0.epoch_vote_accounts(0).unwrap(),
23672446
),
2368-
SwitchForkDecision::NoSwitch
2447+
SwitchForkDecision::SameFork
23692448
);
23702449

23712450
// Trying to switch to another fork at 110 should fail
@@ -2378,7 +2457,7 @@ pub mod test {
23782457
total_stake,
23792458
bank0.epoch_vote_accounts(0).unwrap(),
23802459
),
2381-
SwitchForkDecision::FailedSwitchThreshold
2460+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
23822461
);
23832462

23842463
vote_simulator.simulate_lockout_interval(111, (10, 49), &other_vote_account);
@@ -2456,7 +2535,7 @@ pub mod test {
24562535
total_stake,
24572536
bank0.epoch_vote_accounts(0).unwrap(),
24582537
),
2459-
SwitchForkDecision::FailedSwitchThreshold
2538+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
24602539
);
24612540

24622541
// Add lockout_interval which should be excluded
@@ -2470,7 +2549,7 @@ pub mod test {
24702549
total_stake,
24712550
bank0.epoch_vote_accounts(0).unwrap(),
24722551
),
2473-
SwitchForkDecision::FailedSwitchThreshold
2552+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
24742553
);
24752554

24762555
// Add lockout_interval which should not be excluded

core/src/replay_stage.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ impl ReplayStage {
408408
&ancestors,
409409
&descendants,
410410
&progress,
411-
&tower,
411+
&mut tower,
412412
);
413413
select_vote_and_reset_forks_time.stop();
414414

@@ -1525,7 +1525,7 @@ impl ReplayStage {
15251525
ancestors: &HashMap<u64, HashSet<u64>>,
15261526
descendants: &HashMap<u64, HashSet<u64>>,
15271527
progress: &ProgressMap,
1528-
tower: &Tower,
1528+
tower: &mut Tower,
15291529
) -> SelectVoteAndResetForkResult {
15301530
// Try to vote on the actual heaviest fork. If the heaviest bank is
15311531
// locked out or fails the threshold check, the validator will:
@@ -1552,7 +1552,7 @@ impl ReplayStage {
15521552
.epoch_vote_accounts(heaviest_bank.epoch())
15531553
.expect("Bank epoch vote accounts must contain entry for the bank's own epoch"),
15541554
);
1555-
if switch_fork_decision == SwitchForkDecision::FailedSwitchThreshold {
1555+
if let SwitchForkDecision::FailedSwitchThreshold(_, _) = switch_fork_decision {
15561556
// If we can't switch, then reset to the the next votable
15571557
// bank on the same fork as our last vote, but don't vote
15581558
info!(
@@ -1601,7 +1601,7 @@ impl ReplayStage {
16011601
if !is_locked_out
16021602
&& vote_threshold
16031603
&& propagation_confirmed
1604-
&& switch_fork_decision != SwitchForkDecision::FailedSwitchThreshold
1604+
&& switch_fork_decision.can_vote()
16051605
{
16061606
info!("voting: {} {}", bank.slot(), fork_weight);
16071607
SelectVoteAndResetForkResult {

core/src/validator.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::{
66
cluster_info::{ClusterInfo, Node},
77
cluster_info_vote_listener::VoteTracker,
88
completed_data_sets_service::CompletedDataSetsService,
9-
consensus::{reconcile_blockstore_roots_with_tower, Tower, TowerError},
9+
consensus::{reconcile_blockstore_roots_with_tower, Tower},
1010
contact_info::ContactInfo,
1111
gossip_service::GossipService,
1212
optimistically_confirmed_bank_tracker::{
@@ -720,12 +720,7 @@ fn post_process_restored_tower(
720720
.unwrap_or_else(|err| {
721721
let voting_has_been_active =
722722
active_vote_account_exists_in_bank(&bank_forks.working_bank(), &vote_account);
723-
let saved_tower_is_missing = if let TowerError::IOError(io_err) = &err {
724-
io_err.kind() == std::io::ErrorKind::NotFound
725-
} else {
726-
false
727-
};
728-
if !saved_tower_is_missing {
723+
if !err.is_file_missing() {
729724
datapoint_error!(
730725
"tower_error",
731726
(
@@ -743,7 +738,7 @@ fn post_process_restored_tower(
743738
);
744739
process::exit(1);
745740
}
746-
if saved_tower_is_missing && !voting_has_been_active {
741+
if err.is_file_missing() && !voting_has_been_active {
747742
// Currently, don't protect against spoofed snapshots with no tower at all
748743
info!(
749744
"Ignoring expected failed tower restore because this is the initial \

0 commit comments

Comments
 (0)