Track the CI (model) jobs that don't produce test output files (process being killed etc.) (#40981)

ydshieh · web-flow · commit 5ac3c5171a6b · 2025-09-18T18:27:27.000+02:00
* fix

* fix

---------

Co-authored-by: ydshieh &lt;ydshieh@users.noreply.github.com&gt;
diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml
@@ -138,10 +138,16 @@ jobs:
       - name: Run all tests on GPU
         working-directory: /transformers
         run: |
-          PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}
+          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
+          ls -la
+          # Extract the exit code from the output file
+          PYTEST_EXIT_CODE=$(tail -1 test_outputs.txt | grep "PYTEST_EXIT_CODE:" | cut -d: -f2)
+          exit ${PYTEST_EXIT_CODE:-1}
 
       - name: Failure short reports
         if: ${{ failure() }}
+        # This step is only to show information on Github Actions log.
+        # Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
         continue-on-error: true
         run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt
 
@@ -151,6 +157,12 @@ jobs:
         run: |
           cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt
 
+      - name: Copy test_outputs.txt
+        if: ${{ always() }}
+        continue-on-error: true
+        run: |
+          cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+
       - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
         if: ${{ always() }}
         uses: actions/upload-artifact@v4
diff --git a/utils/notification_service.py b/utils/notification_service.py
@@ -158,9 +158,11 @@ def __init__(
         self.n_model_failures = (
             self.n_model_single_gpu_failures + self.n_model_multi_gpu_failures + self.n_model_unknown_failures
         )
+        self.n_model_jobs_errored_out = sum(r["error"] for r in model_results.values())
 
         # Failures and success of the additional tests
         self.n_additional_success = sum(r["success"] for r in additional_results.values())
+        self.n_additional_jobs_errored_out = sum(r["error"] for r in additional_results.values())
 
         if len(additional_results) > 0:
             # `dicts_to_sum` uses `dicts_to_sum` which requires a non empty dictionary. Let's just add an empty entry.
@@ -183,6 +185,7 @@ def __init__(
         self.n_failures = self.n_model_failures + self.n_additional_failures
         self.n_success = self.n_model_success + self.n_additional_success
         self.n_tests = self.n_failures + self.n_success
+        self.n_jobs_errored_out = self.n_model_jobs_errored_out + self.n_additional_jobs_errored_out
 
         self.model_results = model_results
         self.additional_results = additional_results
@@ -241,6 +244,7 @@ def failures(self) -> dict:
                 "type": "plain_text",
                 "text": (
                     f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
+                    f"🚨 There were {self.n_jobs_errored_out} jobs errored out (not producing test output files).\n"
                     f"The suite ran in {self.time}."
                 ),
                 "emoji": True,
@@ -561,7 +565,7 @@ def payload(self) -> str:
         if self.ci_title:
             blocks.append(self.ci_title_section)
 
-        if self.n_model_failures > 0 or self.n_additional_failures > 0:
+        if self.n_model_failures > 0 or self.n_additional_failures > 0 or self.n_jobs_errored_out > 0:
             blocks.append(self.failures)
 
         if self.n_model_failures > 0:
@@ -1194,6 +1198,7 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
             "success": 0,
             "skipped": 0,
             "time_spent": [],
+            "error": False,
             "failures": {},
             "job_link": {},
             "captured_info": {},
@@ -1222,6 +1227,11 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
                 continue
 
             artifact = retrieve_artifact(path, artifact_gpu)
+
+            if "summary_short" not in artifact:
+                # The process might be killed (for example, CPU OOM), or the job is canceled for some reason), etc.
+                matrix_job_results[matrix_name]["error"] = True
+
             if "stats" in artifact:
                 # Link to the GitHub Action job
                 job = artifact_name_to_job_map[path]