Skip to content

Commit a9f222f

Browse files
committed
reticulating
1 parent 9631682 commit a9f222f

File tree

1 file changed

+153
-25
lines changed

1 file changed

+153
-25
lines changed

nexus/fm/src/de/power_shelf.rs

Lines changed: 153 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,42 @@ use crate::alert;
1111
use crate::ereport_analysis;
1212
use nexus_types::fm::DiagnosisEngineKind;
1313
use nexus_types::fm::Ereport;
14+
use nexus_types::fm::case::CaseEreport;
15+
use nexus_types::fm::case::ImpactedSpSlot;
1416
use nexus_types::fm::ereport;
1517
use nexus_types::inventory::SpType;
18+
use omicron_uuid_kinds::CaseUuid;
1619
use serde::de::DeserializeOwned;
1720
use serde_json::Value;
21+
use std::collections::HashMap;
1822
use std::sync::Arc;
1923

2024
pub struct PowerShelfDiagnosis {
2125
log: slog::Logger,
22-
// TODO(eliza): does this need/want any internal state?
26+
cases_by_shelf: [HashMap<CaseUuid, PscCase>; 2],
2327
}
2428

29+
#[derive(Default)]
30+
struct PscCase {
31+
psus_impacted: PsuSet,
32+
}
33+
34+
type PsuSet = [bool; N_PSUS];
35+
const N_PSUS: usize = 6;
36+
37+
const KNOWN_EREPORT_CLASSES: &[&str] = &[
38+
"hw.remove.psu",
39+
"hw.insert.psu",
40+
"hw.pwr.pwr_good.good",
41+
"hw.pwr.pwr_good.bad",
42+
];
43+
2544
impl PowerShelfDiagnosis {
2645
pub fn new(log: &slog::Logger) -> Self {
27-
Self { log: log.new(slog::o!("de" => "power_shelf")) }
46+
Self {
47+
log: log.new(slog::o!("de" => "power_shelf")),
48+
cases_by_shelf: [HashMap::new(), HashMap::new()],
49+
}
2850
}
2951
}
3052

@@ -38,7 +60,126 @@ impl DiagnosisEngine for PowerShelfDiagnosis {
3860
sitrep: &mut SitrepBuilder<'_>,
3961
case: &mut CaseBuilder,
4062
) -> anyhow::Result<()> {
41-
todo!()
63+
slog::debug!(
64+
self.log,
65+
"analyzing open case from parent sitrep...";
66+
"case_id" => %case.id
67+
);
68+
69+
// ooh, a case we alerady opened! let's figure out what its deal is...
70+
for &ImpactedSpSlot { sp_type, slot, ref comment, .. } in
71+
&case.impacted_sp_slots
72+
{
73+
// skip non-PSC impacts
74+
if sp_type != SpType::Power {
75+
continue;
76+
}
77+
78+
if matches!(slot, 0 | 1) {
79+
slog::debug!(
80+
&self.log,
81+
"open case impacts power shelf {slot}";
82+
"case_id" => %case.id,
83+
"power_shelf" => slot,
84+
"comment" => %comment,
85+
);
86+
// make sure it's tracked.
87+
self.cases_by_shelf[slot as usize].entry(case.id).or_default();
88+
} else {
89+
slog::warn!(
90+
&self.log,
91+
"this is weird: I only know about power shelves numbered \
92+
1 and 0, but found a case that claims to impact power \
93+
shelf {slot}";
94+
"case_id" => %case.id,
95+
"power_shelf" => slot,
96+
"comment" => %comment,
97+
);
98+
}
99+
}
100+
101+
for CaseEreport { ereport, comment, assigned_sitrep_id } in
102+
&case.ereports
103+
{
104+
let class = match &ereport.class {
105+
// This is one we care about
106+
Some(ref class)
107+
if KNOWN_EREPORT_CLASSES.contains(&class.as_ref()) =>
108+
{
109+
slog::debug!(
110+
self.log,
111+
"analyzing ereport assigned to open case...";
112+
"case_id" => %case.id,
113+
"ereport_id" => %ereport.id,
114+
"ereport_class" => %class,
115+
"comment" => %comment,
116+
"assigned_sitrep_id" => %assigned_sitrep_id,
117+
);
118+
class
119+
}
120+
class => {
121+
slog::debug!(
122+
&self.log,
123+
"an ereport with an unknown or missing class was \
124+
assigned to this case (presumably by another DE); \
125+
skipping it...";
126+
"case_id" => %case.id,
127+
"ereport_id" => %ereport.id,
128+
"ereport_class" => ?class,
129+
"comment" => %comment,
130+
);
131+
continue;
132+
}
133+
};
134+
135+
let ereport::Reporter::Sp { sp_type: SpType::Power, slot: shelf } =
136+
ereport.reporter
137+
else {
138+
slog::debug!(
139+
self.log,
140+
"skipping ereport that was not reported by a power shelf";
141+
"case_id" => %case.id,
142+
"ereport_id" => %ereport.id,
143+
"ereport_class" => %class,
144+
"ereport_id" => %ereport.id,
145+
"reporter" => %ereport.reporter,
146+
);
147+
continue;
148+
};
149+
150+
let tracked_case =
151+
self.cases_by_shelf[shelf as usize].entry(case.id).or_default();
152+
153+
// Does the ereport include a PSU slot?
154+
if let Some(slot) = ereport.report["slot"].as_u64() {
155+
let slot = slot as usize;
156+
if slot >= N_PSUS {
157+
slog::warn!(
158+
&self.log,
159+
"this is weird: I only know about power shelves with \
160+
{N_PSUS} PSU SLOTS, but this ereport claims to \
161+
involve slot {slot}";
162+
"case_id" => %case.id,
163+
"ereport_id" => %ereport.id,
164+
"ereport_class" => %class,
165+
"slot" => slot,
166+
)
167+
} else {
168+
slog::debug!(
169+
&self.log,
170+
"found an ereport associated with PSU slot {slot}";
171+
"case_id" => %case.id,
172+
"ereport_id" => %ereport.id,
173+
"ereport_class" => %class,
174+
"shelf" => shelf,
175+
"slot" => slot,
176+
);
177+
tracked_case.psus_impacted[slot] = true;
178+
}
179+
}
180+
}
181+
182+
Ok(())
42183
}
43184

44185
fn analyze_ereport(
@@ -204,6 +345,9 @@ fn grab_json_value<T: DeserializeOwned>(
204345
struct PscEreport {
205346
#[serde(flatten)]
206347
metadata: ereport_analysis::HubrisMetadata,
348+
349+
#[serde(flatten)]
350+
psu: PsuId,
207351
#[serde(flatten)]
208352
class: EreportClass,
209353
}
@@ -212,37 +356,21 @@ struct PscEreport {
212356
#[serde(tag = "k")]
213357
enum EreportClass {
214358
#[serde(rename = "hw.insert.psu")]
215-
PsuInserted {
216-
#[serde(flatten)]
217-
ereport: PsuInsertedEreport,
218-
},
359+
PsuInserted,
219360
#[serde(rename = "hw.remove.psu")]
220-
PsuRemoved {
221-
#[serde(flatten)]
222-
ereport: PsuInsertedEreport,
223-
},
361+
PsuRemoved,
224362
#[serde(rename = "hw.pwr.pwr_good.bad")]
225-
PwrBad {
226-
#[serde(flatten)]
227-
ereport: PwrGoodEreport,
228-
},
229-
}
230-
231-
#[derive(Debug, Eq, PartialEq, serde::Deserialize)]
232-
struct PsuInsertedEreport {
233-
refdes: String,
234-
rail: String,
235-
slot: u8,
236-
fruid: PsuFruid,
363+
PwrBad { pmbus_status: PmbusStatus },
364+
#[serde(rename = "hw.pwr.pwr_good.good")]
365+
PwrGood { pmbus_status: PmbusStatus },
237366
}
238367

239368
#[derive(Debug, Eq, PartialEq, serde::Deserialize)]
240-
struct PwrGoodEreport {
369+
struct PsuId {
241370
refdes: String,
242371
rail: String,
243372
slot: u8,
244373
fruid: PsuFruid,
245-
pmbus_status: PmbusStatus,
246374
}
247375

248376
// These are the same field names that Hubris uses in the ereport. See:

0 commit comments

Comments
 (0)