Skip to content

Commit aaad0c4

Browse files
committed
Do not internally retry for load_rack_secret
1 parent 1ac30a3 commit aaad0c4

File tree

1 file changed

+79
-67
lines changed

1 file changed

+79
-67
lines changed

trust-quorum/src/task.rs

Lines changed: 79 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -14,24 +14,17 @@ use slog::{Logger, debug, error, info, o};
1414
use sprockets_tls::keys::SprocketsConfig;
1515
use std::collections::BTreeSet;
1616
use std::net::SocketAddrV6;
17-
use std::time::Duration;
1817
use thiserror::Error;
1918
use tokio::sync::mpsc::error::SendError;
2019
use tokio::sync::oneshot::error::RecvError;
2120
use tokio::sync::{mpsc, oneshot};
22-
use tokio::time::sleep;
2321
use trust_quorum_protocol::{
2422
Alarm, BaseboardId, CommitError, Configuration, Epoch, ExpungedMetadata,
2523
LoadRackSecretError, LrtqUpgradeError, LrtqUpgradeMsg, Node, NodeCallerCtx,
2624
NodeCommonCtx, NodeCtx, PersistentState, PrepareAndCommitError,
2725
ReconfigurationError, ReconfigureMsg, ReconstructedRackSecret,
2826
};
2927

30-
#[cfg(not(test))]
31-
const LOAD_RACK_SECRET_RETRY_TIMEOUT: Duration = Duration::from_millis(500);
32-
#[cfg(test)]
33-
const LOAD_RACK_SECRET_RETRY_TIMEOUT: Duration = Duration::from_millis(5);
34-
3528
/// We only expect a handful of messages at a time.
3629
const API_CHANNEL_BOUND: usize = 32;
3730

@@ -245,20 +238,13 @@ impl NodeTaskHandle {
245238
pub async fn load_rack_secret(
246239
&self,
247240
epoch: Epoch,
248-
) -> Result<ReconstructedRackSecret, NodeApiError> {
249-
loop {
250-
let (tx, rx) = oneshot::channel();
251-
self.tx
252-
.send(NodeApiRequest::LoadRackSecret { epoch, responder: tx })
253-
.await?;
254-
if let Some(rack_secret) = rx.await?? {
255-
return Ok(rack_secret);
256-
};
257-
258-
// The task returns immediately with `None` if the secret is still
259-
// being loaded. We must therefore retry.
260-
sleep(LOAD_RACK_SECRET_RETRY_TIMEOUT).await;
261-
}
241+
) -> Result<Option<ReconstructedRackSecret>, NodeApiError> {
242+
let (tx, rx) = oneshot::channel();
243+
self.tx
244+
.send(NodeApiRequest::LoadRackSecret { epoch, responder: tx })
245+
.await?;
246+
let rs = rx.await??;
247+
Ok(rs)
262248
}
263249

264250
/// Return `Ok(true)` if the configuration has committed, `Ok(false)` if
@@ -542,7 +528,7 @@ mod tests {
542528
use camino::Utf8PathBuf;
543529
use dropshot::test_util::{LogContext, log_prefix_for_test};
544530
use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition};
545-
use omicron_test_utils::dev::test_setup_log;
531+
use omicron_test_utils::dev::{self, test_setup_log};
546532
use omicron_uuid_kinds::GenericUuid;
547533
use secrecy::ExposeSecretMut;
548534
use sled_hardware_types::Baseboard;
@@ -741,6 +727,36 @@ mod tests {
741727
self.logctx.cleanup_successful();
742728
std::fs::remove_dir_all(self.dir).unwrap();
743729
}
730+
731+
pub async fn wait_for_rack_secrets_and_assert_equality(
732+
&self,
733+
node_indexes: BTreeSet<usize>,
734+
epoch: Epoch,
735+
) -> Result<(), dev::poll::Error<NodeApiError>> {
736+
let poll_interval = Duration::from_millis(10);
737+
let poll_max = Duration::from_secs(10);
738+
wait_for_condition(
739+
async || {
740+
let mut secret = None;
741+
for (i, h) in self.node_handles.iter().enumerate() {
742+
if node_indexes.contains(&i) {
743+
let Some(rs) = h.load_rack_secret(epoch).await?
744+
else {
745+
return Err(CondCheckError::NotYet);
746+
};
747+
if secret.is_none() {
748+
secret = Some(rs.clone());
749+
}
750+
assert_eq!(&rs, secret.as_ref().unwrap());
751+
}
752+
}
753+
Ok(())
754+
},
755+
&poll_interval,
756+
&poll_max,
757+
)
758+
.await
759+
}
744760
}
745761

746762
/// Test that all nodes can connect to each other when given each the full
@@ -964,14 +980,13 @@ mod tests {
964980
.unwrap();
965981

966982
// Now load the rack secret at all nodes
967-
let mut secret = None;
968-
for h in &setup.node_handles {
969-
let rs = h.load_rack_secret(Epoch(1)).await.unwrap();
970-
if secret.is_none() {
971-
secret = Some(rs.clone());
972-
}
973-
assert_eq!(&rs, secret.as_ref().unwrap());
974-
}
983+
setup
984+
.wait_for_rack_secrets_and_assert_equality(
985+
(0..num_nodes).collect(),
986+
Epoch(1),
987+
)
988+
.await
989+
.unwrap();
975990

976991
setup.cleanup_successful();
977992
}
@@ -1109,14 +1124,13 @@ mod tests {
11091124
assert!(status.persistent_state.commits.contains(&Epoch(1)));
11101125

11111126
// Now load the rack secret at all nodes
1112-
let mut secret = None;
1113-
for h in &setup.node_handles {
1114-
let rs = h.load_rack_secret(Epoch(1)).await.unwrap();
1115-
if secret.is_none() {
1116-
secret = Some(rs.clone());
1117-
}
1118-
assert_eq!(&rs, secret.as_ref().unwrap());
1119-
}
1127+
setup
1128+
.wait_for_rack_secrets_and_assert_equality(
1129+
(0..num_nodes).collect(),
1130+
Epoch(1),
1131+
)
1132+
.await
1133+
.unwrap();
11201134

11211135
setup.cleanup_successful();
11221136
}
@@ -1202,14 +1216,13 @@ mod tests {
12021216
.unwrap();
12031217

12041218
// Now load the rack secret at all nodes
1205-
let mut secret = None;
1206-
for h in &setup.node_handles {
1207-
let rs = h.load_rack_secret(Epoch(1)).await.unwrap();
1208-
if secret.is_none() {
1209-
secret = Some(rs.clone());
1210-
}
1211-
assert_eq!(&rs, secret.as_ref().unwrap());
1212-
}
1219+
setup
1220+
.wait_for_rack_secrets_and_assert_equality(
1221+
(0..num_nodes).collect(),
1222+
Epoch(1),
1223+
)
1224+
.await
1225+
.unwrap();
12131226

12141227
// Tell all but the last node how to reach each other
12151228
// This should disconnect the last node from everybody
@@ -1332,22 +1345,22 @@ mod tests {
13321345

13331346
// Load the secret at epoch 1. This should trigger a `CommitAdvance`
13341347
// response from nodes that committed at epoch 2.
1335-
let res =
1336-
setup.node_handles.last().unwrap().load_rack_secret(Epoch(1)).await;
1337-
1338-
println!("res = {res:#?}");
1339-
1340-
let rs = last_node.load_rack_secret(Epoch(2)).await.unwrap();
1348+
setup
1349+
.wait_for_rack_secrets_and_assert_equality(
1350+
BTreeSet::from([num_nodes - 1]),
1351+
Epoch(1),
1352+
)
1353+
.await
1354+
.unwrap();
13411355

1342-
// Ensure the rack secret is the same as at another node
1343-
let expected = setup
1344-
.node_handles
1345-
.first()
1346-
.unwrap()
1347-
.load_rack_secret(Epoch(2))
1356+
// Ensure the rack secret at epoch 2 is the same as at another node
1357+
setup
1358+
.wait_for_rack_secrets_and_assert_equality(
1359+
BTreeSet::from([0, num_nodes - 1]),
1360+
Epoch(2),
1361+
)
13481362
.await
13491363
.unwrap();
1350-
assert_eq!(rs, expected);
13511364

13521365
setup.cleanup_successful();
13531366
}
@@ -1424,14 +1437,13 @@ mod tests {
14241437
.unwrap();
14251438

14261439
// Now load the rack secret at all nodes
1427-
let mut secret = None;
1428-
for h in &setup.node_handles {
1429-
let rs = h.load_rack_secret(Epoch(1)).await.unwrap();
1430-
if secret.is_none() {
1431-
secret = Some(rs.clone());
1432-
}
1433-
assert_eq!(&rs, secret.as_ref().unwrap());
1434-
}
1440+
setup
1441+
.wait_for_rack_secrets_and_assert_equality(
1442+
(0..num_nodes).collect(),
1443+
Epoch(1),
1444+
)
1445+
.await
1446+
.unwrap();
14351447

14361448
setup.cleanup_successful();
14371449
}

0 commit comments

Comments
 (0)