@@ -158,9 +158,11 @@ def __init__(
158158 self .n_model_failures = (
159159 self .n_model_single_gpu_failures + self .n_model_multi_gpu_failures + self .n_model_unknown_failures
160160 )
161+ self .n_model_jobs_errored_out = sum (r ["error" ] for r in model_results .values ())
161162
162163 # Failures and success of the additional tests
163164 self .n_additional_success = sum (r ["success" ] for r in additional_results .values ())
165+ self .n_additional_jobs_errored_out = sum (r ["error" ] for r in additional_results .values ())
164166
165167 if len (additional_results ) > 0 :
166168 # `dicts_to_sum` uses `dicts_to_sum` which requires a non empty dictionary. Let's just add an empty entry.
@@ -183,6 +185,7 @@ def __init__(
183185 self .n_failures = self .n_model_failures + self .n_additional_failures
184186 self .n_success = self .n_model_success + self .n_additional_success
185187 self .n_tests = self .n_failures + self .n_success
188+ self .n_jobs_errored_out = self .n_model_jobs_errored_out + self .n_additional_jobs_errored_out
186189
187190 self .model_results = model_results
188191 self .additional_results = additional_results
@@ -241,6 +244,7 @@ def failures(self) -> dict:
241244 "type" : "plain_text" ,
242245 "text" : (
243246 f"There were { self .n_failures } failures, out of { self .n_tests } tests.\n "
247+ f"🚨 There were { self .n_jobs_errored_out } jobs errored out (not producing test output files).\n "
244248 f"The suite ran in { self .time } ."
245249 ),
246250 "emoji" : True ,
@@ -561,7 +565,7 @@ def payload(self) -> str:
561565 if self .ci_title :
562566 blocks .append (self .ci_title_section )
563567
564- if self .n_model_failures > 0 or self .n_additional_failures > 0 :
568+ if self .n_model_failures > 0 or self .n_additional_failures > 0 or self . n_jobs_errored_out > 0 :
565569 blocks .append (self .failures )
566570
567571 if self .n_model_failures > 0 :
@@ -1194,6 +1198,7 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
11941198 "success" : 0 ,
11951199 "skipped" : 0 ,
11961200 "time_spent" : [],
1201+ "error" : False ,
11971202 "failures" : {},
11981203 "job_link" : {},
11991204 "captured_info" : {},
@@ -1222,6 +1227,11 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
12221227 continue
12231228
12241229 artifact = retrieve_artifact (path , artifact_gpu )
1230+
1231+ if "summary_short" not in artifact :
1232+ # The process might be killed (for example, CPU OOM), or the job is canceled for some reason, etc.
1233+ matrix_job_results [matrix_name ]["error" ] = True
1234+
12251235 if "stats" in artifact :
12261236 # Link to the GitHub Action job
12271237 job = artifact_name_to_job_map [path ]
0 commit comments