Skip to content

Commit 017597c

Browse files
hawkwsmklein
andauthored
[nexus] garbage collect orphaned FM sitreps (#9335)
When a Nexus attempts to commit a new fault management situation report to the sitrep history but fails to do so because another sitrep with the same parent has already been inserted, that sitrep is said to be _orphaned_. Records pertaining to it are left behind in the database, but it will not be accessed by the rest of the system. Thus, we must occasionally garbage-collect such sitreps. This branch adds a background task for doing so. Depends on #9320 Co-authored-by: Sean Klein <sean@oxide.computer>
1 parent f9bc2c5 commit 017597c

File tree

22 files changed

+957
-48
lines changed

22 files changed

+957
-48
lines changed

dev-tools/omdb/src/bin/omdb/db.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1309,7 +1309,7 @@ impl DbArgs {
13091309
sitrep::cmd_db_sitrep(&opctx, &datastore, &fetch_opts, args).await
13101310
}
13111311
DbCommands::Sitreps(args) => {
1312-
sitrep::cmd_db_sitrep_history(&datastore, &fetch_opts, args).await
1312+
sitrep::cmd_db_sitrep_history(&opctx, &datastore, &fetch_opts, args).await
13131313
}
13141314
DbCommands::Sleds(args) => {
13151315
cmd_db_sleds(&opctx, &datastore, &fetch_opts, args).await

dev-tools/omdb/src/bin/omdb/db/sitrep.rs

Lines changed: 31 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
use crate::db::DbFetchOptions;
88
use crate::db::check_limit;
99
use crate::helpers::const_max_len;
10-
use crate::helpers::datetime_rfc3339_concise;
10+
use crate::helpers::datetime_opt_rfc3339_concise;
1111
use anyhow::Context;
1212
use async_bb8_diesel::AsyncRunQueryDsl;
1313
use chrono::{DateTime, Utc};
@@ -17,7 +17,6 @@ use diesel::prelude::*;
1717
use nexus_db_queries::context::OpContext;
1818
use nexus_db_queries::db::DataStore;
1919
use nexus_db_queries::db::model;
20-
use nexus_db_queries::db::pagination::paginated;
2120
use nexus_types::fm;
2221
use omicron_common::api::external::DataPageParams;
2322
use omicron_common::api::external::PaginationOrder;
@@ -26,7 +25,6 @@ use omicron_uuid_kinds::SitrepUuid;
2625
use tabled::Tabled;
2726
use uuid::Uuid;
2827

29-
use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl;
3028
use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl;
3129
use nexus_db_schema::schema::inv_collection::dsl as inv_collection_dsl;
3230

@@ -100,7 +98,7 @@ pub(super) async fn cmd_db_sitrep(
10098
) -> anyhow::Result<()> {
10199
match args.command {
102100
Commands::History(ref args) => {
103-
cmd_db_sitrep_history(datastore, fetch_opts, args).await
101+
cmd_db_sitrep_history(opctx, datastore, fetch_opts, args).await
104102
}
105103
Commands::Info { sitrep, ref args } => {
106104
cmd_db_sitrep_show(opctx, datastore, fetch_opts, args, sitrep).await
@@ -119,6 +117,7 @@ pub(super) async fn cmd_db_sitrep(
119117
}
120118

121119
pub(super) async fn cmd_db_sitrep_history(
120+
opctx: &OpContext,
122121
datastore: &DataStore,
123122
fetch_opts: &DbFetchOptions,
124123
args: &SitrepHistoryArgs,
@@ -138,53 +137,48 @@ pub(super) async fn cmd_db_sitrep_history(
138137
struct SitrepRow {
139138
v: u32,
140139
id: Uuid,
141-
#[tabled(display_with = "datetime_rfc3339_concise")]
142-
created_at: DateTime<Utc>,
140+
#[tabled(display_with = "datetime_opt_rfc3339_concise")]
141+
created_at: Option<DateTime<Utc>>,
143142
comment: String,
144143
}
145144

146-
let conn = datastore.pool_connection_for_tests().await?;
147145
let marker = args.from.map(model::SqlU32::new);
148146
let pagparams = DataPageParams {
149147
marker: marker.as_ref(),
150148
direction: PaginationOrder::Descending,
151149
limit: fetch_opts.fetch_limit,
152150
};
153-
let sitreps: Vec<(model::SitrepVersion, model::SitrepMetadata)> =
154-
paginated(
155-
history_dsl::fm_sitrep_history,
156-
history_dsl::version,
157-
&pagparams,
158-
)
159-
.inner_join(
160-
sitrep_dsl::fm_sitrep.on(history_dsl::sitrep_id.eq(sitrep_dsl::id)),
161-
)
162-
.select((
163-
model::SitrepVersion::as_select(),
164-
model::SitrepMetadata::as_select(),
165-
))
166-
.load_async(&*conn)
151+
let versions = datastore
152+
.fm_sitrep_version_list(&opctx, &pagparams)
167153
.await
168154
.with_context(ctx)?;
169155

170-
check_limit(&sitreps, fetch_opts.fetch_limit, ctx);
171-
172-
let rows = sitreps.into_iter().map(|(version, metadata)| {
173-
let model::SitrepMetadata {
174-
id,
175-
time_created,
176-
comment,
177-
creator_id: _,
178-
parent_sitrep_id: _,
179-
inv_collection_id: _,
180-
} = metadata;
181-
SitrepRow {
182-
v: version.version.into(),
183-
id: id.into_untyped_uuid(),
156+
check_limit(&versions, fetch_opts.fetch_limit, ctx);
157+
158+
let mut rows = Vec::with_capacity(versions.len());
159+
for v in versions {
160+
let (comment, time_created) =
161+
match datastore.fm_sitrep_metadata_read(&opctx, v.id).await {
162+
Ok(s) => (s.comment, Some(s.time_created)),
163+
Err(e) => {
164+
// If the sitrep has an entry in the history table, we
165+
// expect that it will not yet have been archived and
166+
// deleted, so this is an error rather a case of it just
167+
// no longer existing.
168+
eprintln!(
169+
"failed to fetch metadata for sitrep {} (v{}): {e}",
170+
v.id, v.version
171+
);
172+
("<ERROR>".to_string(), None)
173+
}
174+
};
175+
rows.push(SitrepRow {
176+
v: v.version,
177+
id: v.id.into_untyped_uuid(),
184178
created_at: time_created,
185179
comment,
186-
}
187-
});
180+
});
181+
}
188182

189183
let table = tabled::Table::new(rows)
190184
.with(tabled::settings::Style::empty())

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus
6767
use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus;
6868
use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus;
6969
use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus;
70+
use nexus_types::internal_api::background::SitrepGcStatus;
7071
use nexus_types::internal_api::background::SitrepLoadStatus;
7172
use nexus_types::internal_api::background::SupportBundleCleanupReport;
7273
use nexus_types::internal_api::background::SupportBundleCollectionReport;
@@ -1240,6 +1241,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
12401241
"fm_sitrep_loader" => {
12411242
print_task_fm_sitrep_loader(details);
12421243
}
1244+
"fm_sitrep_gc" => {
1245+
print_task_fm_sitrep_gc(details);
1246+
}
12431247
_ => {
12441248
println!(
12451249
"warning: unknown background task: {:?} \
@@ -3142,6 +3146,41 @@ fn print_task_fm_sitrep_loader(details: &serde_json::Value) {
31423146
};
31433147
}
31443148

3149+
fn print_task_fm_sitrep_gc(details: &serde_json::Value) {
3150+
let SitrepGcStatus {
3151+
orphaned_sitreps_found,
3152+
orphaned_sitreps_deleted,
3153+
errors,
3154+
} = match serde_json::from_value::<SitrepGcStatus>(details.clone()) {
3155+
Err(error) => {
3156+
eprintln!(
3157+
"warning: failed to interpret task details: {:?}: {:?}",
3158+
error, details
3159+
);
3160+
return;
3161+
}
3162+
Ok(status) => status,
3163+
};
3164+
3165+
pub const ORPHANS_FOUND: &str = "orphaned sitreps found:";
3166+
pub const ORPHANS_DELETED: &str = "orphaned sitreps deleted:";
3167+
pub const ERRORS: &str = "errors:";
3168+
pub const WIDTH: usize =
3169+
const_max_len(&[ERRORS, ORPHANS_FOUND, ORPHANS_DELETED]) + 1;
3170+
pub const NUM_WIDTH: usize = 4;
3171+
if !errors.is_empty() {
3172+
println!("{ERRICON} {ERRORS:<WIDTH$}{:>NUM_WIDTH$}", errors.len());
3173+
for error in errors {
3174+
println!(" > {error}")
3175+
}
3176+
}
3177+
3178+
println!(" {ORPHANS_FOUND:<WIDTH$}{orphaned_sitreps_found:>NUM_WIDTH$}");
3179+
println!(
3180+
" {ORPHANS_DELETED:<WIDTH$}{orphaned_sitreps_deleted:>NUM_WIDTH$}"
3181+
);
3182+
}
3183+
31453184
const ERRICON: &str = "/!\\";
31463185

31473186
fn warn_if_nonzero(n: usize) -> &'static str {

dev-tools/omdb/tests/env.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,10 @@ task: "external_endpoints"
9999
on each one
100100

101101

102+
task: "fm_sitrep_gc"
103+
garbage collects fault management situation reports
104+
105+
102106
task: "fm_sitrep_loader"
103107
loads the current fault management situation report from the database
104108

@@ -319,6 +323,10 @@ task: "external_endpoints"
319323
on each one
320324

321325

326+
task: "fm_sitrep_gc"
327+
garbage collects fault management situation reports
328+
329+
322330
task: "fm_sitrep_loader"
323331
loads the current fault management situation report from the database
324332

@@ -526,6 +534,10 @@ task: "external_endpoints"
526534
on each one
527535

528536

537+
task: "fm_sitrep_gc"
538+
garbage collects fault management situation reports
539+
540+
529541
task: "fm_sitrep_loader"
530542
loads the current fault management situation report from the database
531543

dev-tools/omdb/tests/successes.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,10 @@ task: "external_endpoints"
334334
on each one
335335

336336

337+
task: "fm_sitrep_gc"
338+
garbage collects fault management situation reports
339+
340+
337341
task: "fm_sitrep_loader"
338342
loads the current fault management situation report from the database
339343

@@ -619,6 +623,13 @@ task: "external_endpoints"
619623

620624
TLS certificates: 0
621625

626+
task: "fm_sitrep_gc"
627+
configured period: every <REDACTED_DURATION>s
628+
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
629+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
630+
orphaned sitreps found: 0
631+
orphaned sitreps deleted: 0
632+
622633
task: "fm_sitrep_loader"
623634
configured period: every <REDACTED_DURATION>s
624635
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
@@ -1166,6 +1177,13 @@ task: "external_endpoints"
11661177

11671178
TLS certificates: 0
11681179

1180+
task: "fm_sitrep_gc"
1181+
configured period: every <REDACTED_DURATION>s
1182+
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
1183+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1184+
orphaned sitreps found: 0
1185+
orphaned sitreps deleted: 0
1186+
11691187
task: "fm_sitrep_loader"
11701188
configured period: every <REDACTED_DURATION>s
11711189
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>

nexus-config/src/nexus_config.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -879,11 +879,21 @@ pub struct FmTasksConfig {
879879
/// reads the latest fault management sitrep from the database.
880880
#[serde_as(as = "DurationSeconds<u64>")]
881881
pub sitrep_load_period_secs: Duration,
882+
/// period (in seconds) for periodic activations of the background task that
883+
/// garbage collects unneeded fault management sitreps in the database.
884+
#[serde_as(as = "DurationSeconds<u64>")]
885+
pub sitrep_gc_period_secs: Duration,
882886
}
883887

884888
impl Default for FmTasksConfig {
885889
fn default() -> Self {
886-
Self { sitrep_load_period_secs: Duration::from_secs(15) }
890+
Self {
891+
sitrep_load_period_secs: Duration::from_secs(15),
892+
// This need not be activated very frequently, as it's triggered any
893+
// time the current sitrep changes, and activating it more
894+
// frequently won't make things more responsive.
895+
sitrep_gc_period_secs: Duration::from_secs(600),
896+
}
887897
}
888898
}
889899

@@ -1190,6 +1200,7 @@ mod test {
11901200
webhook_deliverator.second_retry_backoff_secs = 46
11911201
sp_ereport_ingester.period_secs = 47
11921202
fm.sitrep_load_period_secs = 48
1203+
fm.sitrep_gc_period_secs = 49
11931204
[default_region_allocation_strategy]
11941205
type = "random"
11951206
seed = 0
@@ -1436,6 +1447,7 @@ mod test {
14361447
},
14371448
fm: FmTasksConfig {
14381449
sitrep_load_period_secs: Duration::from_secs(48),
1450+
sitrep_gc_period_secs: Duration::from_secs(49),
14391451
}
14401452
},
14411453
default_region_allocation_strategy:
@@ -1536,6 +1548,7 @@ mod test {
15361548
webhook_deliverator.period_secs = 43
15371549
sp_ereport_ingester.period_secs = 44
15381550
fm.sitrep_load_period_secs = 45
1551+
fm.sitrep_gc_period_secs = 46
15391552
15401553
[default_region_allocation_strategy]
15411554
type = "random"

nexus/background-task-interface/src/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ pub struct BackgroundTasks {
5252
pub task_sp_ereport_ingester: Activator,
5353
pub task_reconfigurator_config_loader: Activator,
5454
pub task_fm_sitrep_loader: Activator,
55+
pub task_fm_sitrep_gc: Activator,
5556

5657
// Handles to activate background tasks that do not get used by Nexus
5758
// at-large. These background tasks are implementation details as far as

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
1616
///
1717
/// This must be updated when you change the database schema. Refer to
1818
/// schema/crdb/README.adoc in the root of this repository for details.
19-
pub const SCHEMA_VERSION: Version = Version::new(205, 0, 0);
19+
pub const SCHEMA_VERSION: Version = Version::new(206, 0, 0);
2020

2121
/// List of all past database schema versions, in *reverse* order
2222
///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
2828
// | leaving the first copy as an example for the next person.
2929
// v
3030
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
31+
KnownVersion::new(206, "fm-sitreps-by-parent-id-index"),
3132
KnownVersion::new(205, "fm-sitrep"),
3233
KnownVersion::new(204, "local-storage-dataset"),
3334
KnownVersion::new(203, "scim-actor-audit-log"),

0 commit comments

Comments
 (0)