Skip to content

Commit c009adb

Browse files
authored
Push probe zones to sled-agent instead of pulling them from Nexus (#9353)
- Add APIs in the sled-agent for creating / deleting probes, and have Nexus use them when managing probes from the external API, especially replacing the entire set of probes with a PUT. - Rework the probe manager to accept the list of expected probes from Nexus, and drive the state toward that, rather than periodically polling Nexus. - Add background task for periodically pushing probes to sleds, and omdb innards for reporting its state. - Closes #9157
1 parent 163db2f commit c009adb

File tree

33 files changed

+9363
-289
lines changed

33 files changed

+9363
-289
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ use nexus_types::internal_api::background::InstanceReincarnationStatus;
6060
use nexus_types::internal_api::background::InstanceUpdaterStatus;
6161
use nexus_types::internal_api::background::InventoryLoadStatus;
6262
use nexus_types::internal_api::background::LookupRegionPortStatus;
63+
use nexus_types::internal_api::background::ProbeDistributorStatus;
6364
use nexus_types::internal_api::background::ReadOnlyRegionReplacementStartStatus;
6465
use nexus_types::internal_api::background::RegionReplacementDriverStatus;
6566
use nexus_types::internal_api::background::RegionReplacementStatus;
@@ -1193,6 +1194,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
11931194
"phantom_disks" => {
11941195
print_task_phantom_disks(details);
11951196
}
1197+
"probe_distributor" => {
1198+
print_task_probe_distributor(details);
1199+
}
11961200
"read_only_region_replacement_start" => {
11971201
print_task_read_only_region_replacement_start(details);
11981202
}
@@ -2131,6 +2135,32 @@ fn print_task_phantom_disks(details: &serde_json::Value) {
21312135
};
21322136
}
21332137

2138+
/// Print the details reported by the `probe_distributor` background task.
///
/// `details` is deserialized into a `ProbeDistributorStatus`. If that fails,
/// a warning containing the error and the raw value is written to stderr and
/// nothing else is printed. Otherwise this writes to stdout: the total number
/// of pushed probes, one line per sled with a non-zero probe count, the
/// number of errors, and one line per error.
fn print_task_probe_distributor(details: &serde_json::Value) {
2139+
match serde_json::from_value::<ProbeDistributorStatus>(details.clone()) {
2140+
Err(error) => eprintln!(
2141+
"warning: failed to interpret task details: {:?}: {:?}",
2142+
error, details
2143+
),
2144+
Ok(ProbeDistributorStatus { probes_by_sled, errors }) => {
2145+
// The total is the sum of the per-sled counts.
let n_total_probes: usize = probes_by_sled.values().sum();
2146+
// NOTE(review): "succesfully" is misspelled, but the expected omdb
// test output contains the same text, so both must change together.
println!(" succesfully-pushed probes: {} total", n_total_probes);
2147+
for (sled_id, count) in probes_by_sled {
2148+
// Sleds that received no probes are omitted from the listing.
if count == 0 {
2149+
continue;
2150+
}
2151+
println!(" sled_id={} n_probes={}", sled_id, count);
2152+
}
2153+
println!(" errors while pushing probes: {} total", errors.len());
2154+
// Each error is reported with the ID and IP of the sled it refers to.
for err in errors {
2155+
println!(
2156+
" sled_id={} sled_ip={} error={}",
2157+
err.sled_id, err.sled_ip, err.error,
2158+
);
2159+
}
2160+
}
2161+
};
2162+
}
2163+
21342164
fn print_task_read_only_region_replacement_start(details: &serde_json::Value) {
21352165
match serde_json::from_value::<ReadOnlyRegionReplacementStartStatus>(
21362166
details.clone(),

dev-tools/omdb/tests/env.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,10 @@ task: "physical_disk_adoption"
149149
ensure new physical disks are automatically marked in-service
150150

151151

152+
task: "probe_distributor"
153+
distributes networking probe zones to sleds
154+
155+
152156
task: "read_only_region_replacement_start"
153157
detect if read-only regions need replacement and begin the process
154158

@@ -373,6 +377,10 @@ task: "physical_disk_adoption"
373377
ensure new physical disks are automatically marked in-service
374378

375379

380+
task: "probe_distributor"
381+
distributes networking probe zones to sleds
382+
383+
376384
task: "read_only_region_replacement_start"
377385
detect if read-only regions need replacement and begin the process
378386

@@ -584,6 +592,10 @@ task: "physical_disk_adoption"
584592
ensure new physical disks are automatically marked in-service
585593

586594

595+
task: "probe_distributor"
596+
distributes networking probe zones to sleds
597+
598+
587599
task: "read_only_region_replacement_start"
588600
detect if read-only regions need replacement and begin the process
589601

dev-tools/omdb/tests/successes.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,10 @@ task: "physical_disk_adoption"
384384
ensure new physical disks are automatically marked in-service
385385

386386

387+
task: "probe_distributor"
388+
distributes networking probe zones to sleds
389+
390+
387391
task: "read_only_region_replacement_start"
388392
detect if read-only regions need replacement and begin the process
389393

@@ -713,6 +717,13 @@ task: "physical_disk_adoption"
713717
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
714718
last completion reported error: task disabled
715719

720+
task: "probe_distributor"
721+
configured period: every <REDACTED_DURATION>m
722+
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
723+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
724+
succesfully-pushed probes: 0 total
725+
errors while pushing probes: 0 total
726+
716727
task: "read_only_region_replacement_start"
717728
configured period: every <REDACTED_DURATION>days <REDACTED_DURATION>h <REDACTED_DURATION>m <REDACTED_DURATION>s
718729
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
@@ -1267,6 +1278,13 @@ task: "physical_disk_adoption"
12671278
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
12681279
last completion reported error: task disabled
12691280

1281+
task: "probe_distributor"
1282+
configured period: every <REDACTED_DURATION>m
1283+
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
1284+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1285+
succesfully-pushed probes: 0 total
1286+
errors while pushing probes: 0 total
1287+
12701288
task: "read_only_region_replacement_start"
12711289
configured period: every <REDACTED_DURATION>days <REDACTED_DURATION>h <REDACTED_DURATION>m <REDACTED_DURATION>s
12721290
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>

illumos-utils/src/opte/port.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ impl Drop for PortInner {
5757
Err(e) => e,
5858
};
5959
eprintln!(
60-
"WARNING: Failed to delete the xde device. It must be deleted
60+
"WARNING: Failed to delete the xde device. It must be deleted \
6161
out of band, and it will not be possible to recreate the xde \
6262
device until then. Error: {:?}",
6363
err,

nexus-config/src/nexus_config.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,8 @@ pub struct BackgroundTaskConfig {
443443
pub sp_ereport_ingester: SpEreportIngesterConfig,
444444
/// configuration for fault management background tasks
445445
pub fm: FmTasksConfig,
446+
/// configuration for networking probe distributor
447+
pub probe_distributor: ProbeDistributorConfig,
446448
}
447449

448450
#[serde_as]
@@ -897,6 +899,15 @@ impl Default for FmTasksConfig {
897899
}
898900
}
899901

902+
/// Configuration for the background task that distributes networking probe
/// zones to sled-agents.
#[serde_as]
903+
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
904+
pub struct ProbeDistributorConfig {
905+
/// period (in seconds) for periodic activations of the background task that
906+
/// distributes networking probe zones to sled-agents.
907+
// Serialized as an unsigned 64-bit number of seconds via `serde_as`.
#[serde_as(as = "DurationSeconds<u64>")]
908+
pub period_secs: Duration,
909+
}
910+
900911
/// Configuration for a nexus server
901912
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
902913
pub struct PackageConfig {
@@ -1201,6 +1212,7 @@ mod test {
12011212
sp_ereport_ingester.period_secs = 47
12021213
fm.sitrep_load_period_secs = 48
12031214
fm.sitrep_gc_period_secs = 49
1215+
probe_distributor.period_secs = 50
12041216
[default_region_allocation_strategy]
12051217
type = "random"
12061218
seed = 0
@@ -1448,7 +1460,10 @@ mod test {
14481460
fm: FmTasksConfig {
14491461
sitrep_load_period_secs: Duration::from_secs(48),
14501462
sitrep_gc_period_secs: Duration::from_secs(49),
1451-
}
1463+
},
1464+
probe_distributor: ProbeDistributorConfig {
1465+
period_secs: Duration::from_secs(50),
1466+
},
14521467
},
14531468
default_region_allocation_strategy:
14541469
crate::nexus_config::RegionAllocationStrategy::Random {
@@ -1549,6 +1564,7 @@ mod test {
15491564
sp_ereport_ingester.period_secs = 44
15501565
fm.sitrep_load_period_secs = 45
15511566
fm.sitrep_gc_period_secs = 46
1567+
probe_distributor.period_secs = 47
15521568
15531569
[default_region_allocation_strategy]
15541570
type = "random"

nexus/background-task-interface/src/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ pub struct BackgroundTasks {
5353
pub task_reconfigurator_config_loader: Activator,
5454
pub task_fm_sitrep_loader: Activator,
5555
pub task_fm_sitrep_gc: Activator,
56+
pub task_probe_distributor: Activator,
5657

5758
// Handles to activate background tasks that do not get used by Nexus
5859
// at-large. These background tasks are implementation details as far as

0 commit comments

Comments
 (0)