Skip to content

Commit a45a089

Browse files
authored
Region snapshot replacement for read-only regions (#7435)
Reuse the region snapshot replacement machinery to replace read-only regions. This is done by storing a replacement type in the region snapshot replacement record, so that either a region snapshot _or_ a read-only region can be the subject of this kind of replacement. The procedure is the same for both types, so all of the existing code can be reused. A future commit will rename region snapshot replacement (and all references to it) to "read-only target replacement", to reflect that the machinery now applies to both region snapshots and read-only regions. That will be a mostly mechanical set of changes that can be reviewed separately and with much less scrutiny. For now, manually requesting the replacement of a read-only region with omdb is done through the region replacement manual request command, not the region snapshot replacement manual request command. That future commit will change this so that it becomes part of a read-only target replacement request. Fixes #6172
1 parent 798b276 commit a45a089

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+2130
-490
lines changed

dev-tools/omdb/src/bin/omdb/db.rs

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ use nexus_db_model::NetworkInterfaceKind;
7676
use nexus_db_model::PhysicalDisk;
7777
use nexus_db_model::Probe;
7878
use nexus_db_model::Project;
79+
use nexus_db_model::ReadOnlyTargetReplacement;
7980
use nexus_db_model::Region;
8081
use nexus_db_model::RegionReplacement;
8182
use nexus_db_model::RegionReplacementState;
@@ -3068,11 +3069,19 @@ async fn cmd_db_region_replacement_request(
30683069
) -> Result<(), anyhow::Error> {
30693070
let region = datastore.get_region(args.region_id).await?;
30703071

3071-
let request_id = datastore
3072-
.create_region_replacement_request_for_region(opctx, &region)
3073-
.await?;
3072+
if region.read_only() {
3073+
let request_id = datastore
3074+
.create_read_only_region_replacement_request(opctx, region.id())
3075+
.await?;
3076+
3077+
println!("region snapshot replacement {request_id} created");
3078+
} else {
3079+
let request_id = datastore
3080+
.create_region_replacement_request_for_region(opctx, &region)
3081+
.await?;
30743082

3075-
println!("region replacement {request_id} created");
3083+
println!("region replacement {request_id} created");
3084+
}
30763085

30773086
Ok(())
30783087
}
@@ -4448,12 +4457,22 @@ async fn cmd_db_region_snapshot_replacement_status(
44484457
" state: {:?}",
44494458
request.replacement_state
44504459
);
4451-
println!(
4452-
" region snapshot: {} {} {}",
4453-
request.old_dataset_id,
4454-
request.old_region_id,
4455-
request.old_snapshot_id,
4456-
);
4460+
match request.replacement_type() {
4461+
ReadOnlyTargetReplacement::RegionSnapshot {
4462+
dataset_id,
4463+
region_id,
4464+
snapshot_id,
4465+
} => {
4466+
println!(
4467+
" region snapshot: {} {} {}",
4468+
dataset_id, region_id, snapshot_id,
4469+
);
4470+
}
4471+
4472+
ReadOnlyTargetReplacement::ReadOnlyRegion { region_id } => {
4473+
println!(" read-only region: {}", region_id);
4474+
}
4475+
}
44574476
println!(" new region id: {:?}", request.new_region_id);
44584477
println!(" in-progress steps left: {:?}", steps_left);
44594478
println!();
@@ -4485,10 +4504,22 @@ async fn cmd_db_region_snapshot_replacement_info(
44854504

44864505
println!(" started: {}", request.request_time);
44874506
println!(" state: {:?}", request.replacement_state);
4488-
println!(
4489-
" region snapshot: {} {} {}",
4490-
request.old_dataset_id, request.old_region_id, request.old_snapshot_id,
4491-
);
4507+
match request.replacement_type() {
4508+
ReadOnlyTargetReplacement::RegionSnapshot {
4509+
dataset_id,
4510+
region_id,
4511+
snapshot_id,
4512+
} => {
4513+
println!(
4514+
" region snapshot: {} {} {}",
4515+
dataset_id, region_id, snapshot_id,
4516+
);
4517+
}
4518+
4519+
ReadOnlyTargetReplacement::ReadOnlyRegion { region_id } => {
4520+
println!(" read-only region: {}", region_id);
4521+
}
4522+
}
44924523
println!(" new region id: {:?}", request.new_region_id);
44934524
println!(" in-progress steps left: {:?}", steps_left);
44944525
println!();

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ use nexus_types::internal_api::background::BlueprintRendezvousStatus;
4646
use nexus_types::internal_api::background::InstanceReincarnationStatus;
4747
use nexus_types::internal_api::background::InstanceUpdaterStatus;
4848
use nexus_types::internal_api::background::LookupRegionPortStatus;
49+
use nexus_types::internal_api::background::ReadOnlyRegionReplacementStartStatus;
4950
use nexus_types::internal_api::background::RegionReplacementDriverStatus;
5051
use nexus_types::internal_api::background::RegionReplacementStatus;
5152
use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus;
@@ -928,6 +929,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
928929
"phantom_disks" => {
929930
print_task_phantom_disks(details);
930931
}
932+
"read_only_region_replacement_start" => {
933+
print_task_read_only_region_replacement_start(details);
934+
}
931935
"region_replacement" => {
932936
print_task_region_replacement(details);
933937
}
@@ -1724,6 +1728,32 @@ fn print_task_phantom_disks(details: &serde_json::Value) {
17241728
};
17251729
}
17261730

1731+
fn print_task_read_only_region_replacement_start(details: &serde_json::Value) {
1732+
match serde_json::from_value::<ReadOnlyRegionReplacementStartStatus>(
1733+
details.clone(),
1734+
) {
1735+
Err(error) => eprintln!(
1736+
"warning: failed to interpret task details: {:?}: {:?}",
1737+
error, details
1738+
),
1739+
1740+
Ok(status) => {
1741+
println!(
1742+
" total requests created ok: {}",
1743+
status.requests_created_ok.len(),
1744+
);
1745+
for line in &status.requests_created_ok {
1746+
println!(" > {line}");
1747+
}
1748+
1749+
println!(" errors: {}", status.errors.len());
1750+
for line in &status.errors {
1751+
println!(" > {line}");
1752+
}
1753+
}
1754+
}
1755+
}
1756+
17271757
fn print_task_region_replacement(details: &serde_json::Value) {
17281758
match serde_json::from_value::<RegionReplacementStatus>(details.clone()) {
17291759
Err(error) => eprintln!(

dev-tools/omdb/tests/env.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,10 @@ task: "physical_disk_adoption"
129129
ensure new physical disks are automatically marked in-service
130130

131131

132+
task: "read_only_region_replacement_start"
133+
detect if read-only regions need replacement and begin the process
134+
135+
132136
task: "region_replacement"
133137
detects if a region requires replacing and begins the process
134138

@@ -313,6 +317,10 @@ task: "physical_disk_adoption"
313317
ensure new physical disks are automatically marked in-service
314318

315319

320+
task: "read_only_region_replacement_start"
321+
detect if read-only regions need replacement and begin the process
322+
323+
316324
task: "region_replacement"
317325
detects if a region requires replacing and begins the process
318326

@@ -484,6 +492,10 @@ task: "physical_disk_adoption"
484492
ensure new physical disks are automatically marked in-service
485493

486494

495+
task: "read_only_region_replacement_start"
496+
detect if read-only regions need replacement and begin the process
497+
498+
487499
task: "region_replacement"
488500
detects if a region requires replacing and begins the process
489501

dev-tools/omdb/tests/successes.out

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,10 @@ task: "physical_disk_adoption"
337337
ensure new physical disks are automatically marked in-service
338338

339339

340+
task: "read_only_region_replacement_start"
341+
detect if read-only regions need replacement and begin the process
342+
343+
340344
task: "region_replacement"
341345
detects if a region requires replacing and begins the process
342346

@@ -613,6 +617,14 @@ task: "physical_disk_adoption"
613617
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
614618
last completion reported error: task disabled
615619

620+
task: "read_only_region_replacement_start"
621+
configured period: every <REDACTED_DURATION>m
622+
currently executing: no
623+
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
624+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
625+
total requests created ok: 0
626+
errors: 0
627+
616628
task: "region_replacement"
617629
configured period: every <REDACTED_DURATION>m
618630
currently executing: no
@@ -1104,6 +1116,14 @@ task: "physical_disk_adoption"
11041116
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
11051117
last completion reported error: task disabled
11061118

1119+
task: "read_only_region_replacement_start"
1120+
configured period: every <REDACTED_DURATION>m
1121+
currently executing: no
1122+
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
1123+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1124+
total requests created ok: 0
1125+
errors: 0
1126+
11071127
task: "region_replacement"
11081128
configured period: every <REDACTED_DURATION>m
11091129
currently executing: no

nexus-config/src/nexus_config.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,9 @@ pub struct BackgroundTaskConfig {
419419
RegionSnapshotReplacementFinishConfig,
420420
/// configuration for TUF artifact replication task
421421
pub tuf_artifact_replication: TufArtifactReplicationConfig,
422+
/// configuration for read-only region replacement start task
423+
pub read_only_region_replacement_start:
424+
ReadOnlyRegionReplacementStartConfig,
422425
}
423426

424427
#[serde_as]
@@ -735,6 +738,14 @@ pub struct TufArtifactReplicationConfig {
735738
pub min_sled_replication: usize,
736739
}
737740

741+
#[serde_as]
742+
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
743+
pub struct ReadOnlyRegionReplacementStartConfig {
744+
/// period (in seconds) for periodic activations of this background task
745+
#[serde_as(as = "DurationSeconds<u64>")]
746+
pub period_secs: Duration,
747+
}
748+
738749
/// Configuration for a nexus server
739750
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
740751
pub struct PackageConfig {
@@ -993,6 +1004,7 @@ mod test {
9931004
region_snapshot_replacement_finish.period_secs = 30
9941005
tuf_artifact_replication.period_secs = 300
9951006
tuf_artifact_replication.min_sled_replication = 3
1007+
read_only_region_replacement_start.period_secs = 30
9961008
[default_region_allocation_strategy]
9971009
type = "random"
9981010
seed = 0
@@ -1194,6 +1206,10 @@ mod test {
11941206
period_secs: Duration::from_secs(300),
11951207
min_sled_replication: 3,
11961208
},
1209+
read_only_region_replacement_start:
1210+
ReadOnlyRegionReplacementStartConfig {
1211+
period_secs: Duration::from_secs(30),
1212+
},
11971213
},
11981214
default_region_allocation_strategy:
11991215
crate::nexus_config::RegionAllocationStrategy::Random {
@@ -1279,6 +1295,7 @@ mod test {
12791295
region_snapshot_replacement_finish.period_secs = 30
12801296
tuf_artifact_replication.period_secs = 300
12811297
tuf_artifact_replication.min_sled_replication = 3
1298+
read_only_region_replacement_start.period_secs = 30
12821299
[default_region_allocation_strategy]
12831300
type = "random"
12841301
"##,

0 commit comments

Comments
 (0)