Skip to content
This repository was archived by the owner on Jan 22, 2025. It is now read-only.

Commit 3f5ecc9

Browse files
committed
Add a test for opt. conf violation without tower
1 parent 7344916 commit 3f5ecc9

File tree

6 files changed

+334
-134
lines changed

6 files changed

+334
-134
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

core/src/consensus.rs

Lines changed: 69 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@ use std::{
3030
};
3131
use thiserror::Error;
3232

33-
#[derive(PartialEq, Clone, Debug)]
33+
#[derive(PartialEq, Clone, Debug, AbiExample)]
3434
pub enum SwitchForkDecision {
3535
SwitchProof(Hash),
36-
NoSwitch,
37-
FailedSwitchThreshold,
36+
SameFork,
37+
FailedSwitchThreshold(u64, u64),
3838
}
3939

4040
impl SwitchForkDecision {
@@ -45,8 +45,8 @@ impl SwitchForkDecision {
4545
authorized_voter_pubkey: &Pubkey,
4646
) -> Option<Instruction> {
4747
match self {
48-
SwitchForkDecision::FailedSwitchThreshold => None,
49-
SwitchForkDecision::NoSwitch => Some(vote_instruction::vote(
48+
SwitchForkDecision::FailedSwitchThreshold(_, _) => None,
49+
SwitchForkDecision::SameFork => Some(vote_instruction::vote(
5050
vote_account_pubkey,
5151
authorized_voter_pubkey,
5252
vote,
@@ -101,6 +101,8 @@ pub struct Tower {
101101
// This could be emptied after some time; but left intact indefinitely for easier
102102
// implementation
103103
stray_restored_slot: Option<Slot>,
104+
#[serde(skip)]
105+
pub last_switch_threshold_check: Option<(Slot, SwitchForkDecision)>,
104106
}
105107

106108
impl Default for Tower {
@@ -115,6 +117,7 @@ impl Default for Tower {
115117
path: PathBuf::default(),
116118
tmp_path: PathBuf::default(),
117119
stray_restored_slot: Option::default(),
120+
last_switch_threshold_check: Option::default(),
118121
};
119122
// VoteState::root_slot is ensured to be Some in Tower
120123
tower.lockouts.root_slot = Some(Slot::default());
@@ -440,6 +443,10 @@ impl Tower {
440443
self.lockouts.root_slot.unwrap()
441444
}
442445

446+
pub fn node_pubkey(&self) -> Pubkey {
447+
self.node_pubkey
448+
}
449+
443450
// a slot is recent if it's newer than the last vote we have
444451
pub fn is_recent(&self, slot: Slot) -> bool {
445452
if let Some(last_voted_slot) = self.lockouts.last_voted_slot() {
@@ -520,9 +527,16 @@ impl Tower {
520527
// all of them.
521528
panic!("no ancestors found with slot: {}", last_voted_slot);
522529
} else {
523-
// bank_forks doesn't have corresponding data for the stray restored last vote,
524-
// meaning some inconsistency between saved tower and ledger.
525-
// (newer snapshot, or only a saved tower is moved over to new setup?)
530+
// compare slots not to error! just because of newer snapshots
531+
if self.last_switch_threshold_check.is_none() && switch_slot < last_voted_slot {
532+
error!(
533+
"{} bank_forks doesn't have corresponding data for the stray restored \
534+
last vote({}), meaning some inconsistency between saved tower and ledger.",
535+
self.node_pubkey,
536+
last_voted_slot
537+
);
538+
}
539+
// (system crash? process crash? newer snapshot, or only a saved tower is moved over to new setup?)
526540
&empty_ancestors
527541
}
528542
});
@@ -532,12 +546,22 @@ impl Tower {
532546
if switch_slot == last_voted_slot || switch_slot_ancestors.contains(&last_voted_slot) {
533547
// If the `switch_slot is a descendant of the last vote,
534548
// no switching proof is necessary
535-
return SwitchForkDecision::NoSwitch;
549+
return SwitchForkDecision::SameFork;
536550
}
537551

538-
// Should never consider switching to an ancestor
539-
// of your last vote
540-
assert!(!last_vote_ancestors.contains(&switch_slot));
552+
// Generally, should never consider switching to an ancestor of your last vote
553+
if last_vote_ancestors.contains(&switch_slot) {
554+
let restored_stray_exception = self.is_stray_last_vote() && switch_slot < last_voted_slot;
555+
if !restored_stray_exception {
556+
panic!("Don't switch to slot ({}), which is ancestors({:?}) of last vote: {}", switch_slot, last_vote_ancestors, last_voted_slot);
557+
} else {
558+
// If last vote is stray, it's possible to reach here, because we don't have complete view of voting history.
559+
// In that case, pretend we're always failing to switch, regarless switch_slot is actually on same frok or not.
560+
// this is for safety; otherwise we accidentally might doubl#e-vte on ancestors.
561+
// siwtch_slot must be `< last_voted_slot` because switch_slot must be one of last_vote_ancestors in this if clause.
562+
return SwitchForkDecision::FailedSwitchThreshold(0,0);
563+
}
564+
}
541565

542566
// By this point, we know the `switch_slot` is on a different fork
543567
// (is neither an ancestor nor descendant of `last_vote`), so a
@@ -622,10 +646,10 @@ impl Tower {
622646
if (locked_out_stake as f64 / total_stake as f64) > SWITCH_FORK_THRESHOLD {
623647
SwitchForkDecision::SwitchProof(switch_proof)
624648
} else {
625-
SwitchForkDecision::FailedSwitchThreshold
649+
SwitchForkDecision::FailedSwitchThreshold(locked_out_stake, total_stake)
626650
}
627651
})
628-
.unwrap_or(SwitchForkDecision::NoSwitch)
652+
.unwrap_or(SwitchForkDecision::SameFork)
629653
}
630654

631655
pub fn check_vote_stake_threshold(
@@ -931,9 +955,9 @@ impl Tower {
931955
self.lockouts = vote_state;
932956
self.do_initialize_lockouts(root, |v| v.slot > root);
933957
trace!(
934-
"{} lockouts initialized to {:?}",
958+
"Lockouts in tower for {} is initialized using bank {}",
935959
self.node_pubkey,
936-
self.lockouts
960+
bank.slot(),
937961
);
938962
assert_eq!(
939963
self.lockouts.node_pubkey, self.node_pubkey,
@@ -985,6 +1009,11 @@ impl Tower {
9851009
bincode::serialize_into(&mut file, &saved_tower)?;
9861010
// file.sync_all() hurts performance; pipeline sync-ing and submitting votes to the cluster!
9871011
}
1012+
trace!(
1013+
"{} persisted votes: {:?}",
1014+
node_keypair.pubkey(),
1015+
self.voted_slots()
1016+
);
9881017
fs::rename(&new_filename, &filename)?;
9891018
// self.path.parent().sync_all() hurts performance same as the above sync
9901019

@@ -1046,6 +1075,16 @@ pub enum TowerError {
10461075
FatallyInconsistent(&'static str),
10471076
}
10481077

1078+
impl TowerError {
1079+
pub fn is_file_missing(&self) -> bool {
1080+
if let TowerError::IOError(io_err) = &self {
1081+
io_err.kind() == std::io::ErrorKind::NotFound
1082+
} else {
1083+
false
1084+
}
1085+
}
1086+
}
1087+
10491088
#[frozen_abi(digest = "Gaxfwvx5MArn52mKZQgzHmDCyn5YfCuTHvp5Et3rFfpp")]
10501089
#[derive(Default, Clone, Serialize, Deserialize, Debug, PartialEq, AbiExample)]
10511090
pub struct SavedTower {
@@ -1268,7 +1307,7 @@ pub mod test {
12681307
&ancestors,
12691308
&descendants,
12701309
&self.progress,
1271-
&tower,
1310+
tower,
12721311
);
12731312

12741313
// Make sure this slot isn't locked out or failing threshold
@@ -1465,11 +1504,11 @@ pub mod test {
14651504
#[test]
14661505
fn test_to_vote_instruction() {
14671506
let vote = Vote::default();
1468-
let mut decision = SwitchForkDecision::FailedSwitchThreshold;
1507+
let mut decision = SwitchForkDecision::FailedSwitchThreshold(0, 0);
14691508
assert!(decision
14701509
.to_vote_instruction(vote.clone(), &Pubkey::default(), &Pubkey::default())
14711510
.is_none());
1472-
decision = SwitchForkDecision::NoSwitch;
1511+
decision = SwitchForkDecision::SameFork;
14731512
assert_eq!(
14741513
decision.to_vote_instruction(vote.clone(), &Pubkey::default(), &Pubkey::default()),
14751514
Some(vote_instruction::vote(
@@ -1572,7 +1611,7 @@ pub mod test {
15721611
total_stake,
15731612
bank0.epoch_vote_accounts(0).unwrap(),
15741613
),
1575-
SwitchForkDecision::NoSwitch
1614+
SwitchForkDecision::SameFork
15761615
);
15771616

15781617
// Trying to switch to another fork at 110 should fail
@@ -1585,7 +1624,7 @@ pub mod test {
15851624
total_stake,
15861625
bank0.epoch_vote_accounts(0).unwrap(),
15871626
),
1588-
SwitchForkDecision::FailedSwitchThreshold
1627+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
15891628
);
15901629

15911630
// Adding another validator lockout on a descendant of last vote should
@@ -1600,7 +1639,7 @@ pub mod test {
16001639
total_stake,
16011640
bank0.epoch_vote_accounts(0).unwrap(),
16021641
),
1603-
SwitchForkDecision::FailedSwitchThreshold
1642+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
16041643
);
16051644

16061645
// Adding another validator lockout on an ancestor of last vote should
@@ -1615,7 +1654,7 @@ pub mod test {
16151654
total_stake,
16161655
bank0.epoch_vote_accounts(0).unwrap(),
16171656
),
1618-
SwitchForkDecision::FailedSwitchThreshold
1657+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
16191658
);
16201659

16211660
// Adding another validator lockout on a different fork, but the lockout
@@ -1630,7 +1669,7 @@ pub mod test {
16301669
total_stake,
16311670
bank0.epoch_vote_accounts(0).unwrap(),
16321671
),
1633-
SwitchForkDecision::FailedSwitchThreshold
1672+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
16341673
);
16351674

16361675
// Adding another validator lockout on a different fork, and the lockout
@@ -1647,7 +1686,7 @@ pub mod test {
16471686
total_stake,
16481687
bank0.epoch_vote_accounts(0).unwrap(),
16491688
),
1650-
SwitchForkDecision::FailedSwitchThreshold
1689+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
16511690
);
16521691

16531692
// Adding another validator lockout on a different fork, and the lockout
@@ -1698,7 +1737,7 @@ pub mod test {
16981737
total_stake,
16991738
bank0.epoch_vote_accounts(0).unwrap(),
17001739
),
1701-
SwitchForkDecision::FailedSwitchThreshold
1740+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
17021741
);
17031742
}
17041743

@@ -2366,7 +2405,7 @@ pub mod test {
23662405
total_stake,
23672406
bank0.epoch_vote_accounts(0).unwrap(),
23682407
),
2369-
SwitchForkDecision::NoSwitch
2408+
SwitchForkDecision::SameFork
23702409
);
23712410

23722411
// Trying to switch to another fork at 110 should fail
@@ -2379,7 +2418,7 @@ pub mod test {
23792418
total_stake,
23802419
bank0.epoch_vote_accounts(0).unwrap(),
23812420
),
2382-
SwitchForkDecision::FailedSwitchThreshold
2421+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
23832422
);
23842423

23852424
vote_simulator.simulate_lockout_interval(111, (10, 49), &other_vote_account);
@@ -2457,7 +2496,7 @@ pub mod test {
24572496
total_stake,
24582497
bank0.epoch_vote_accounts(0).unwrap(),
24592498
),
2460-
SwitchForkDecision::FailedSwitchThreshold
2499+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
24612500
);
24622501

24632502
// Add lockout_interval which should be excluded
@@ -2471,7 +2510,7 @@ pub mod test {
24712510
total_stake,
24722511
bank0.epoch_vote_accounts(0).unwrap(),
24732512
),
2474-
SwitchForkDecision::FailedSwitchThreshold
2513+
SwitchForkDecision::FailedSwitchThreshold(0, 20000)
24752514
);
24762515

24772516
// Add lockout_interval which should not be excluded

core/src/replay_stage.rs

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ impl ReplayStage {
405405
&ancestors,
406406
&descendants,
407407
&progress,
408-
&tower,
408+
&mut tower,
409409
);
410410
select_vote_and_reset_forks_time.stop();
411411

@@ -1511,7 +1511,7 @@ impl ReplayStage {
15111511
ancestors: &HashMap<u64, HashSet<u64>>,
15121512
descendants: &HashMap<u64, HashSet<u64>>,
15131513
progress: &ProgressMap,
1514-
tower: &Tower,
1514+
tower: &mut Tower,
15151515
) -> SelectVoteAndResetForkResult {
15161516
// Try to vote on the actual heaviest fork. If the heaviest bank is
15171517
// locked out or fails the threshold check, the validator will:
@@ -1538,7 +1538,19 @@ impl ReplayStage {
15381538
.epoch_vote_accounts(heaviest_bank.epoch())
15391539
.expect("Bank epoch vote accounts must contain entry for the bank's own epoch"),
15401540
);
1541-
if switch_fork_decision == SwitchForkDecision::FailedSwitchThreshold {
1541+
let a = Some((heaviest_bank.slot(), switch_fork_decision.clone()));
1542+
if tower.last_switch_threshold_check != a {
1543+
let (slot, choice) = a.clone().unwrap();
1544+
trace!(
1545+
"{}: switch threshold is checked: slot {} (parent: {}): {:?}",
1546+
tower.node_pubkey(),
1547+
slot,
1548+
heaviest_bank.parent_slot(),
1549+
choice
1550+
);
1551+
tower.last_switch_threshold_check = a;
1552+
}
1553+
if let SwitchForkDecision::FailedSwitchThreshold(_, _) = switch_fork_decision {
15421554
// If we can't switch, then reset to the the next votable
15431555
// bank on the same fork as our last vote, but don't vote
15441556
info!(
@@ -1584,11 +1596,13 @@ impl ReplayStage {
15841596
failure_reasons.push(HeaviestForkFailures::NoPropagatedConfirmation(bank.slot()));
15851597
}
15861598

1587-
if !is_locked_out
1588-
&& vote_threshold
1589-
&& propagation_confirmed
1590-
&& switch_fork_decision != SwitchForkDecision::FailedSwitchThreshold
1591-
{
1599+
if let SwitchForkDecision::FailedSwitchThreshold(_, _) = switch_fork_decision {
1600+
SelectVoteAndResetForkResult {
1601+
vote_bank: None,
1602+
reset_bank: Some(bank.clone()),
1603+
heaviest_fork_failures: failure_reasons,
1604+
}
1605+
} else if !is_locked_out && vote_threshold && propagation_confirmed {
15921606
info!("voting: {} {}", bank.slot(), fork_weight);
15931607
SelectVoteAndResetForkResult {
15941608
vote_bank: Some((bank.clone(), switch_fork_decision)),

core/src/validator.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::{
66
cluster_info::{ClusterInfo, Node},
77
cluster_info_vote_listener::VoteTracker,
88
completed_data_sets_service::CompletedDataSetsService,
9-
consensus::{reconcile_blockstore_roots_with_tower, Tower, TowerError},
9+
consensus::{reconcile_blockstore_roots_with_tower, Tower},
1010
contact_info::ContactInfo,
1111
gossip_service::{discover_cluster, GossipService},
1212
poh_recorder::{PohRecorder, GRACE_TICKS_FACTOR, MAX_GRACE_SLOTS},
@@ -644,12 +644,7 @@ fn post_process_restored_tower(
644644
.unwrap_or_else(|err| {
645645
let voting_has_been_active =
646646
active_vote_account_exists_in_bank(&bank_forks.working_bank(), &vote_account);
647-
let saved_tower_is_missing = if let TowerError::IOError(io_err) = &err {
648-
io_err.kind() == std::io::ErrorKind::NotFound
649-
} else {
650-
false
651-
};
652-
if !saved_tower_is_missing {
647+
if !err.is_file_missing() {
653648
datapoint_error!(
654649
"tower_error",
655650
(
@@ -667,7 +662,7 @@ fn post_process_restored_tower(
667662
);
668663
process::exit(1);
669664
}
670-
if saved_tower_is_missing && !voting_has_been_active {
665+
if err.is_file_missing() && !voting_has_been_active {
671666
// Currently, don't protect against spoofed snapshots with no tower at all
672667
info!(
673668
"Ignoring expected failed tower restore because this is the initial \

local-cluster/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ homepage = "https://solana.com/"
1111
[dependencies]
1212
itertools = "0.9.0"
1313
gag = "0.1.10"
14+
fs_extra = "1.1.0"
1415
log = "0.4.8"
1516
rand = "0.7.0"
1617
solana-config-program = { path = "../programs/config", version = "1.4.0" }

0 commit comments

Comments
 (0)