Refactor daily CI workflow (#30012)

* separate jobs
* separate jobs
* use channel name directly instead of ID
* use channel name directly instead of ID
* use channel name directly instead of ID

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
huggingface · Apr 5, 2024 · b17b54d
1 parent 17cd7a9
commit b17b54d
Show file tree

Hide file tree

Showing 4 changed files with 255 additions and 125 deletions.
diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml
@@ -0,0 +1,59 @@
+name: Self-hosted runner (scheduled)
+
+
+on:
+  repository_dispatch:
+  schedule:
+    - cron: "17 2 * * *"
+  push:
+    branches:
+      - run_scheduled_ci*
+
+jobs:
+  model-ci:
+    name: Model CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_tests_gpu
+      slack_report_channel: "#transformers-ci-daily-models"
+    secrets: inherit
+
+  torch-pipeline:
+    name: Torch pipeline CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_pipelines_torch_gpu
+      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
+    secrets: inherit
+
+  tf-pipeline:
+    name: TF pipeline CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_pipelines_tf_gpu
+      slack_report_channel: "#transformers-ci-daily-pipeline-tf"
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_examples_gpu
+      slack_report_channel: "#transformers-ci-daily-examples"
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_all_tests_torch_cuda_extensions_gpu
+      slack_report_channel: "#transformers-ci-daily-deepspeed"
+    secrets: inherit
+
+  quantization-ci:
+    name: Quantization CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_tests_quantization_torch_gpu
+      slack_report_channel: "#transformers-ci-daily-quantization"
+    secrets: inherit
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
@@ -7,12 +7,14 @@ name: Self-hosted runner (scheduled)
 # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
 
 on:
-  repository_dispatch:
-  schedule:
-    - cron: "17 2 * * *"
-  push:
-    branches:
-      - run_scheduled_ci*
+  workflow_call:
+    inputs:
+      job:
+        required: true
+        type: string
+      slack_report_channel:
+        required: true
+        type: string
 
 env:
   HF_HOME: /mnt/cache
@@ -31,6 +33,7 @@ env:
 
 jobs:
   setup:
+    if: ${{ inputs.job == 'run_tests_gpu' }}
     name: Setup
     strategy:
       matrix:
@@ -71,6 +74,7 @@ jobs:
           nvidia-smi
 
   run_tests_gpu:
+    if: ${{ inputs.job == 'run_tests_gpu' }}
     name: " "
     needs: setup
     strategy:
@@ -85,17 +89,17 @@ jobs:
       slice_id: ${{ matrix.slice_id }}
     secrets: inherit
 
-  run_examples_gpu:
-    name: Examples directory
+  run_pipelines_torch_gpu:
+    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
+    name: PyTorch pipelines
     strategy:
       fail-fast: false
       matrix:
-        machine_type: [single-gpu]
+        machine_type: [single-gpu, multi-gpu]
     runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
     container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+      image: huggingface/transformers-pytorch-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -118,39 +122,39 @@ jobs:
         working-directory: /transformers
         run: pip freeze
 
-      - name: Run examples tests on GPU
+      - name: Run all pipeline tests on GPU
         working-directory: /transformers
         run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
 
       - name: Failure short reports
         if: ${{ failure() }}
         continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
 
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
+      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu"
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_examples_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
-  run_pipelines_torch_gpu:
-    name: PyTorch pipelines
+  run_pipelines_tf_gpu:
+    if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
+    name: TensorFlow pipelines
     strategy:
       fail-fast: false
       matrix:
         machine_type: [single-gpu, multi-gpu]
     runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
     container:
-      image: huggingface/transformers-pytorch-gpu
+      image: huggingface/transformers-tensorflow-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
+        run: |
+          git fetch && git checkout ${{ github.sha }}
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
@@ -172,36 +176,35 @@ jobs:
       - name: Run all pipeline tests on GPU
         working-directory: /transformers
         run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines
 
       - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+        if: ${{ always() }}
+        run: |
+          cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
 
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu"
+      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu"
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+          name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
 
-  run_pipelines_tf_gpu:
-    name: TensorFlow pipelines
+  run_examples_gpu:
+    if: ${{ inputs.job == 'run_examples_gpu' }}
+    name: Examples directory
     strategy:
       fail-fast: false
       matrix:
-        machine_type: [single-gpu, multi-gpu]
+        machine_type: [single-gpu]
     runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
     container:
-      image: huggingface/transformers-tensorflow-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone
         working-directory: /transformers
-        run: |
-          git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ github.sha }}
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
@@ -220,31 +223,32 @@ jobs:
         working-directory: /transformers
         run: pip freeze
 
-      - name: Run all pipeline tests on GPU
+      - name: Run examples tests on GPU
         working-directory: /transformers
         run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
 
       - name: Failure short reports
-        if: ${{ always() }}
-        run: |
-          cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
 
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu"
+      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
+          name: ${{ matrix.machine_type }}_run_examples_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
 
   run_all_tests_torch_cuda_extensions_gpu:
+    if: ${{ inputs.job == 'run_all_tests_torch_cuda_extensions_gpu' }}
     name: Torch CUDA extension tests
     strategy:
       fail-fast: false
       matrix:
         machine_type: [single-gpu, multi-gpu]
     runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-    needs: setup
     container:
       image: huggingface/transformers-pytorch-deepspeed-latest-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -298,6 +302,7 @@ jobs:
           path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
 
   run_tests_quantization_torch_gpu:
+    if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
     name: Quantization tests
     strategy:
       fail-fast: false
@@ -307,7 +312,6 @@ jobs:
     container:
       image: huggingface/transformers-quantization-latest-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -348,18 +352,11 @@ jobs:
           path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu
 
   run_extract_warnings:
+    # Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic.
+    if: ${{ always() && inputs.job == 'run_tests_gpu' }}
     name: Extract warnings in CI artifacts
     runs-on: ubuntu-22.04
-    if: always()
-    needs: [
-      setup,
-      run_tests_gpu,
-      run_examples_gpu,
-      run_pipelines_tf_gpu,
-      run_pipelines_torch_gpu,
-      run_all_tests_torch_cuda_extensions_gpu,
-      run_tests_quantization_torch_gpu,
-    ]
+    needs: [setup, run_tests_gpu]
     steps:
       - name: Checkout transformers
         uses: actions/checkout@v3
@@ -396,52 +393,24 @@ jobs:
           path: warnings_in_ci/selected_warnings.json
 
   send_results:
-    name: Send results to webhook
-    runs-on: ubuntu-22.04
-    if: always()
+    name: Slack Report
     needs: [
       setup,
       run_tests_gpu,
-      run_examples_gpu,
-      run_pipelines_tf_gpu,
       run_pipelines_torch_gpu,
+      run_pipelines_tf_gpu,
+      run_examples_gpu,
       run_all_tests_torch_cuda_extensions_gpu,
       run_tests_quantization_torch_gpu,
       run_extract_warnings
     ]
-    steps:
-      - name: Preliminary job status
-        shell: bash
-        # For the meaning of these environment variables, see the job `Setup`
-        run: |
-          echo "Setup status: ${{ needs.setup.result }}"
-
-      - uses: actions/checkout@v3
-      - uses: actions/download-artifact@v3
-      - name: Send message to Slack
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
-          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
-          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
-          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          CI_EVENT: scheduled
-          CI_SHA: ${{ github.sha }}
-          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
-          SETUP_STATUS: ${{ needs.setup.result }}
-        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
-        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
-        run: |
-          sudo apt-get install -y curl
-          pip install slack_sdk
-          pip show slack_sdk
-          python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}"
-
-      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
-      - name: Failure table artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: prev_ci_results
-          path: prev_ci_results
+    if: ${{ always() }}
+    uses: ./.github/workflows/slack-report.yml
+    with:
+      job: ${{ inputs.job }}
+      # This would be `skipped` if `setup` is skipped.
+      setup_status: ${{ needs.setup.result }}
+      slack_report_channel: ${{ inputs.slack_report_channel }}
+      # This would be an empty string if `setup` is skipped.
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+    secrets: inherit