Skip to content

Commit 5ac3c51

Browse files
authored
Track the CI (model) jobs that don't produce test output files (process being killed etc.) (#40981)
* fix * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
1 parent d9d7f6a commit 5ac3c51

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

.github/workflows/model_jobs.yml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,16 @@ jobs:
138138
- name: Run all tests on GPU
139139
working-directory: /transformers
140140
run: |
141-
PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}
141+
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
142+
ls -la
143+
# Extract the exit code from the output file
144+
PYTEST_EXIT_CODE=$(tail -1 test_outputs.txt | grep "PYTEST_EXIT_CODE:" | cut -d: -f2)
145+
exit ${PYTEST_EXIT_CODE:-1}
142146
143147
- name: Failure short reports
144148
if: ${{ failure() }}
149+
# This step only shows information in the GitHub Actions log.
150+
# Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
145151
continue-on-error: true
146152
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt
147153

@@ -151,6 +157,12 @@ jobs:
151157
run: |
152158
cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt
153159
160+
- name: Copy test_outputs.txt
161+
if: ${{ always() }}
162+
continue-on-error: true
163+
run: |
164+
cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
165+
154166
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
155167
if: ${{ always() }}
156168
uses: actions/upload-artifact@v4

utils/notification_service.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,9 +158,11 @@ def __init__(
158158
self.n_model_failures = (
159159
self.n_model_single_gpu_failures + self.n_model_multi_gpu_failures + self.n_model_unknown_failures
160160
)
161+
self.n_model_jobs_errored_out = sum(r["error"] for r in model_results.values())
161162

162163
# Failures and success of the additional tests
163164
self.n_additional_success = sum(r["success"] for r in additional_results.values())
165+
self.n_additional_jobs_errored_out = sum(r["error"] for r in additional_results.values())
164166

165167
if len(additional_results) > 0:
166168
# `dicts_to_sum` requires a non-empty dictionary. Let's just add an empty entry.
@@ -183,6 +185,7 @@ def __init__(
183185
self.n_failures = self.n_model_failures + self.n_additional_failures
184186
self.n_success = self.n_model_success + self.n_additional_success
185187
self.n_tests = self.n_failures + self.n_success
188+
self.n_jobs_errored_out = self.n_model_jobs_errored_out + self.n_additional_jobs_errored_out
186189

187190
self.model_results = model_results
188191
self.additional_results = additional_results
@@ -241,6 +244,7 @@ def failures(self) -> dict:
241244
"type": "plain_text",
242245
"text": (
243246
f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
247+
f"🚨 There were {self.n_jobs_errored_out} jobs errored out (not producing test output files).\n"
244248
f"The suite ran in {self.time}."
245249
),
246250
"emoji": True,
@@ -561,7 +565,7 @@ def payload(self) -> str:
561565
if self.ci_title:
562566
blocks.append(self.ci_title_section)
563567

564-
if self.n_model_failures > 0 or self.n_additional_failures > 0:
568+
if self.n_model_failures > 0 or self.n_additional_failures > 0 or self.n_jobs_errored_out > 0:
565569
blocks.append(self.failures)
566570

567571
if self.n_model_failures > 0:
@@ -1194,6 +1198,7 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
11941198
"success": 0,
11951199
"skipped": 0,
11961200
"time_spent": [],
1201+
"error": False,
11971202
"failures": {},
11981203
"job_link": {},
11991204
"captured_info": {},
@@ -1222,6 +1227,11 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
12221227
continue
12231228

12241229
artifact = retrieve_artifact(path, artifact_gpu)
1230+
1231+
if "summary_short" not in artifact:
1232+
# The process might be killed (for example, CPU OOM), or the job might be canceled for some reason, etc.
1233+
matrix_job_results[matrix_name]["error"] = True
1234+
12251235
if "stats" in artifact:
12261236
# Link to the GitHub Action job
12271237
job = artifact_name_to_job_map[path]

0 commit comments

Comments
 (0)