@@ -11,20 +11,42 @@ use crate::alert;
1111use crate :: ereport_analysis;
1212use nexus_types:: fm:: DiagnosisEngineKind ;
1313use nexus_types:: fm:: Ereport ;
14+ use nexus_types:: fm:: case:: CaseEreport ;
15+ use nexus_types:: fm:: case:: ImpactedSpSlot ;
1416use nexus_types:: fm:: ereport;
1517use nexus_types:: inventory:: SpType ;
18+ use omicron_uuid_kinds:: CaseUuid ;
1619use serde:: de:: DeserializeOwned ;
1720use serde_json:: Value ;
21+ use std:: collections:: HashMap ;
1822use std:: sync:: Arc ;
1923
2024pub struct PowerShelfDiagnosis {
2125 log : slog:: Logger ,
22- // TODO(eliza): does this need/want any internal state?
26+ cases_by_shelf : [ HashMap < CaseUuid , PscCase > ; 2 ] ,
2327}
2428
29+ #[ derive( Default ) ]
30+ struct PscCase {
31+ psus_impacted : PsuSet ,
32+ }
33+
34+ type PsuSet = [ bool ; N_PSUS ] ;
35+ const N_PSUS : usize = 6 ;
36+
37+ const KNOWN_EREPORT_CLASSES : & [ & str ] = & [
38+ "hw.remove.psu" ,
39+ "hw.insert.psu" ,
40+ "hw.pwr.pwr_good.good" ,
41+ "hw.pwr.pwr_good.bad" ,
42+ ] ;
43+
2544impl PowerShelfDiagnosis {
2645 pub fn new ( log : & slog:: Logger ) -> Self {
27- Self { log : log. new ( slog:: o!( "de" => "power_shelf" ) ) }
46+ Self {
47+ log : log. new ( slog:: o!( "de" => "power_shelf" ) ) ,
48+ cases_by_shelf : [ HashMap :: new ( ) , HashMap :: new ( ) ] ,
49+ }
2850 }
2951}
3052
@@ -38,7 +60,126 @@ impl DiagnosisEngine for PowerShelfDiagnosis {
3860 sitrep : & mut SitrepBuilder < ' _ > ,
3961 case : & mut CaseBuilder ,
4062 ) -> anyhow:: Result < ( ) > {
41- todo ! ( )
63+ slog:: debug!(
64+ self . log,
65+ "analyzing open case from parent sitrep..." ;
66+ "case_id" => %case. id
67+ ) ;
68+
69+ // ooh, a case we already opened! let's figure out what its deal is...
70+ for & ImpactedSpSlot { sp_type, slot, ref comment, .. } in
71+ & case. impacted_sp_slots
72+ {
73+ // skip non-PSC impacts
74+ if sp_type != SpType :: Power {
75+ continue ;
76+ }
77+
78+ if matches ! ( slot, 0 | 1 ) {
79+ slog:: debug!(
80+ & self . log,
81+ "open case impacts power shelf {slot}" ;
82+ "case_id" => %case. id,
83+ "power_shelf" => slot,
84+ "comment" => %comment,
85+ ) ;
86+ // make sure it's tracked.
87+ self . cases_by_shelf [ slot as usize ] . entry ( case. id ) . or_default ( ) ;
88+ } else {
89+ slog:: warn!(
90+ & self . log,
91+ "this is weird: I only know about power shelves numbered \
92+ 1 and 0, but found a case that claims to impact power \
93+ shelf {slot}";
94+ "case_id" => %case. id,
95+ "power_shelf" => slot,
96+ "comment" => %comment,
97+ ) ;
98+ }
99+ }
100+
101+ for CaseEreport { ereport, comment, assigned_sitrep_id } in
102+ & case. ereports
103+ {
104+ let class = match & ereport. class {
105+ // This is one we care about
106+ Some ( ref class)
107+ if KNOWN_EREPORT_CLASSES . contains ( & class. as_ref ( ) ) =>
108+ {
109+ slog:: debug!(
110+ self . log,
111+ "analyzing ereport assigned to open case..." ;
112+ "case_id" => %case. id,
113+ "ereport_id" => %ereport. id,
114+ "ereport_class" => %class,
115+ "comment" => %comment,
116+ "assigned_sitrep_id" => %assigned_sitrep_id,
117+ ) ;
118+ class
119+ }
120+ class => {
121+ slog:: debug!(
122+ & self . log,
123+ "an ereport with an unknown or missing class was \
124+ assigned to this case (presumably by another DE); \
125+ skipping it...";
126+ "case_id" => %case. id,
127+ "ereport_id" => %ereport. id,
128+ "ereport_class" => ?class,
129+ "comment" => %comment,
130+ ) ;
131+ continue ;
132+ }
133+ } ;
134+
135+ let ereport:: Reporter :: Sp { sp_type : SpType :: Power , slot : shelf } =
136+ ereport. reporter
137+ else {
138+ slog:: debug!(
139+ self . log,
140+ "skipping ereport that was not reported by a power shelf" ;
141+ "case_id" => %case. id,
142+ "ereport_id" => %ereport. id,
143+ "ereport_class" => %class,
144+ "ereport_id" => %ereport. id,
145+ "reporter" => %ereport. reporter,
146+ ) ;
147+ continue ;
148+ } ;
149+
150+ let tracked_case =
151+ self . cases_by_shelf [ shelf as usize ] . entry ( case. id ) . or_default ( ) ;
152+
153+ // Does the ereport include a PSU slot?
154+ if let Some ( slot) = ereport. report [ "slot" ] . as_u64 ( ) {
155+ let slot = slot as usize ;
156+ if slot >= N_PSUS {
157+ slog:: warn!(
158+ & self . log,
159+ "this is weird: I only know about power shelves with \
160+ {N_PSUS} PSU SLOTS, but this ereport claims to \
161+ involve slot {slot}";
162+ "case_id" => %case. id,
163+ "ereport_id" => %ereport. id,
164+ "ereport_class" => %class,
165+ "slot" => slot,
166+ )
167+ } else {
168+ slog:: debug!(
169+ & self . log,
170+ "found an ereport associated with PSU slot {slot}" ;
171+ "case_id" => %case. id,
172+ "ereport_id" => %ereport. id,
173+ "ereport_class" => %class,
174+ "shelf" => shelf,
175+ "slot" => slot,
176+ ) ;
177+ tracked_case. psus_impacted [ slot] = true ;
178+ }
179+ }
180+ }
181+
182+ Ok ( ( ) )
42183 }
43184
44185 fn analyze_ereport (
@@ -204,6 +345,9 @@ fn grab_json_value<T: DeserializeOwned>(
204345struct PscEreport {
205346 #[ serde( flatten) ]
206347 metadata : ereport_analysis:: HubrisMetadata ,
348+
349+ #[ serde( flatten) ]
350+ psu : PsuId ,
207351 #[ serde( flatten) ]
208352 class : EreportClass ,
209353}
@@ -212,37 +356,21 @@ struct PscEreport {
212356#[ serde( tag = "k" ) ]
213357enum EreportClass {
214358 #[ serde( rename = "hw.insert.psu" ) ]
215- PsuInserted {
216- #[ serde( flatten) ]
217- ereport : PsuInsertedEreport ,
218- } ,
359+ PsuInserted ,
219360 #[ serde( rename = "hw.remove.psu" ) ]
220- PsuRemoved {
221- #[ serde( flatten) ]
222- ereport : PsuInsertedEreport ,
223- } ,
361+ PsuRemoved ,
224362 #[ serde( rename = "hw.pwr.pwr_good.bad" ) ]
225- PwrBad {
226- #[ serde( flatten) ]
227- ereport : PwrGoodEreport ,
228- } ,
229- }
230-
231- #[ derive( Debug , Eq , PartialEq , serde:: Deserialize ) ]
232- struct PsuInsertedEreport {
233- refdes : String ,
234- rail : String ,
235- slot : u8 ,
236- fruid : PsuFruid ,
363+ PwrBad { pmbus_status : PmbusStatus } ,
364+ #[ serde( rename = "hw.pwr.pwr_good.good" ) ]
365+ PwrGood { pmbus_status : PmbusStatus } ,
237366}
238367
239368#[ derive( Debug , Eq , PartialEq , serde:: Deserialize ) ]
240- struct PwrGoodEreport {
369+ struct PsuId {
241370 refdes : String ,
242371 rail : String ,
243372 slot : u8 ,
244373 fruid : PsuFruid ,
245- pmbus_status : PmbusStatus ,
246374}
247375
248376// These are the same field names that Hubris uses in the ereport. See:
0 commit comments