Skip to content

Commit b633f89

Browse files
DiweiSun and chuanqi129
authored
[CI]Enable CI for TorchAO on XPU Using Docker (#3027)
* enable xpu ci test Co-authored-by: Wang, Chuanqi <chuanqi.wang@intel.com>
1 parent d43090a commit b633f89

File tree

3 files changed

+261
-0
lines changed

3 files changed

+261
-0
lines changed

.github/pytorch-probot.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ ciflow_push_tags:
44
- ciflow/tutorials
55
- ciflow/rocm
66
- ciflow/4xh100
7+
- ciflow/xpu

.github/scripts/ci_test_xpu.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# CI test driver for TorchAO on Intel XPU.
# Expects the torchao checkout in ./torchao (mounted into the CI container)
# and a working conda plus the Intel XPU nightly wheel index.

# Fail fast and echo commands: without -e, a failed conda/pip/build step is
# silently ignored and only the final pytest exit code decides CI outcome.
set -ex

conda create -yn xpu_ao_ci python=3.10
source activate xpu_ao_ci

# Build torchao with the system toolchain; sccache is unavailable in this container.
export CC=/usr/bin/gcc
export CXX=/usr/bin/g++
export SCCACHE_DISABLE=1

python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
cd torchao && python3 setup.py install && cd ..

# Smoke-test that torch and torchao import cleanly before running the suite.
python3 -c "import torch; import torchao; print(f'Torch version: {torch.__version__}')"

pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py

.github/workflows/xpu_test.yml

Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: xpu-test

on:
  push:
    tags:
      - ciflow/xpu/*

permissions:
  id-token: write
  contents: read

concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: linux.idc.xpu
    env:
      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
      PYTORCH_RETRY_TEST_CASES: 1
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      # pytorch/pytorch is needed only for its reusable CI actions / docker tooling.
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          ref: nightly
          path: pytorch
          fetch-depth: 1
          submodules: false

      - name: Checkout Torchao (ao)
        uses: actions/checkout@v4
        with:
          repository: ${{ github.repository }}
          ref: ${{ github.head_ref || github.ref }}
          path: torchao
          fetch-depth: 1
          submodules: recursive

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check system info
        if: always()
        shell: bash
        run: |
          cat /etc/os-release || true
          cat /etc/apt/sources.list.d/oneAPI.list || true
          cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
          whoami

      - name: Runner health check xpu-smi
        if: always()
        shell: bash
        run: |
          timeout 30 xpu-smi discovery || true

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
          msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

      - name: Runner health check disconnect on failure
        if: ${{ failure() }}
        shell: bash
        run: |
          killall runsvc.sh

      - name: Preserve github env variables for use in docker
        shell: bash
        run: |
          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

      - name: XPU set GPU_FLAG
        shell: bash
        run: |
          # Add render group for container creation.
          render_gid=`cat /etc/group | grep render | cut -d: -f3`
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

      - name: configure aws credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ env.DOCKER_IMAGE }}
          working-directory: pytorch
          repo-name: pytorch

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*:}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      # Second GPU probe via clinfo; renamed so the two health-check steps are
      # distinguishable in the job log (previously a duplicate step name).
      - name: Runner health check GPU count (clinfo)
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Test
        id: test
        env:
          TEST_COMMAND: torchao/.github/scripts/ci_test_xpu.sh
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x

          # detached container should get cleaned up by teardown_ec2_linux
          # Used for GPU_FLAG since that doesn't play nice
          # NOTE: no --name flag here — it previously referenced ${container_name}
          # inside its own command substitution, where the variable is still
          # empty; the container ID captured from stdout is used instead.
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on same IDC node
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          # This job defines no matrix, so the previous
          # matrix.config/shard/num_shards/runner name expanded to "coredumps----".
          name: coredumps-${{ github.job }}-${{ github.run_id }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

0 commit comments

Comments
 (0)