# TODO: This is similar to _linux-test, but an `if` statement would be needed
# in roughly a dozen places, so it is probably better to keep this as a
# separate workflow altogether.

name: xpu-test

on:
  push:
    tags:
      - ciflow/xpu/*

permissions:
  id-token: write
  contents: read

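# Non-main refs key the concurrency group on the ref, so a newer push to the
# same tag cancels an older in-flight run; main is keyed on the run number,
# which is unique per run, so runs on main never cancel each other.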
concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    # Don't run on forked repos or an empty test matrix (disabled: this
    # workflow takes no test-matrix input).
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: linux.idc.xpu
    env:
      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
      PYTORCH_RETRY_TEST_CASES: 1
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          ref: nightly
          path: pytorch
          fetch-depth: 1
          submodules: false

      - name: Checkout Torchao (ao)
        uses: actions/checkout@v4
        with:
          repository: ${{ github.repository }}
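          # github.head_ref is only set on pull_request events; on tag pushes
          # this falls back to github.ref.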
          ref: ${{ github.head_ref || github.ref }}
          path: torchao
          fetch-depth: 1
          submodules: recursive

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner on this node is already pruning, skip.
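          # The grep below also counts its own process, so a count of exactly 1
          # means no other prune is in flight.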
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check system info
        if: always()
        shell: bash
        run: |
          cat /etc/os-release || true
          cat /etc/apt/sources.list.d/oneAPI.list || true
          cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
          whoami

      - name: Runner health check xpu-smi
        if: always()
        shell: bash
        run: |
          timeout 30 xpu-smi discovery || true

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
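          # `shell: bash` runs with -e and -o pipefail, so `|| true` keeps a
          # zero-match grep from failing the step before the message below prints.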
          ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
          msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

      - name: Runner health check disconnect on failure
        if: ${{ failure() }}
        shell: bash
        run: |
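          # runsvc.sh is the self-hosted runner's service wrapper; killing it
          # takes the runner offline so no further jobs are scheduled onto an
          # unhealthy node.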
          killall runsvc.sh

      - name: Preserve github env variables for use in docker
        shell: bash
        run: |
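          # Snapshot the GITHUB_* and CI variables so that container steps which
          # bind-mount /tmp can source them, following the usual pytorch CI
          # pattern (this workflow as written does not consume the file itself).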
          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

      - name: XPU set GPU_FLAG
        shell: bash
        run: |
          # Add render group for container creation.
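          # The /dev/dri render nodes are group-owned by `render`, whose gid
          # varies across distros, so look up the numeric gid at runtime.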
          render_gid=$(getent group render | cut -d: -f3)
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

      - name: Configure AWS credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ env.DOCKER_IMAGE }}
          working-directory: pytorch
          repo-name: pytorch

      - name: Print command to pull the public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
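          # ${ECR_DOCKER_IMAGE##*:} strips everything up to and including the
          # last colon, leaving just the image tag.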
          tag=${ECR_DOCKER_IMAGE##*:}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Runner health check GPU count (clinfo)
        if: always()
        shell: bash
        run: |
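          # clinfo -l lists OpenCL platforms and their devices; counting the
          # 'Device' lines gives the number of GPUs visible to the runtime.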
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Test
        id: test
        env:
          TEST_COMMAND: torchao/.github/scripts/ci_test_xpu.sh
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x

          # The detached container is cleaned up by the "Teardown XPU" step at
          # the end of the job. GPU_FLAG must stay unquoted so that it expands
          # into multiple docker flags, hence the shellcheck suppressions.
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # Save the container id for later steps.
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # The jenkins user cannot write to the mounted workspace; the test
          # script works around this by copying the checkout into the jenkins
          # home directory inside the container.
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
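          # gdb batch-prints a backtrace from each core file (-ex 'bt'), then
          # quits (-ex 'q').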
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on the same IDC node.
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          name: coredumps-${{ github.job }}-${{ github.run_attempt }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner on this node is already pruning, skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()