Skip to content

Commit b633f89

Browse files
DiweiSun and chuanqi129
authored
[CI]Enable CI for TorchAO on XPU Using Docker (#3027)
* enable xpu ci test Co-authored-by: Wang, Chuanqi <chuanqi.wang@intel.com>
1 parent d43090a commit b633f89

File tree

3 files changed

+261
-0
lines changed

3 files changed

+261
-0
lines changed

.github/pytorch-probot.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ ciflow_push_tags:
44
- ciflow/tutorials
55
- ciflow/rocm
66
- ciflow/4xh100
7+
- ciflow/xpu

.github/scripts/ci_test_xpu.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# CI test driver for TorchAO on Intel XPU.
# Expects the torchao checkout in ./torchao (mounted into the CI container)
# and a working conda plus the Intel XPU nightly wheel index.

# Fail fast and echo commands: without -e, a failed conda/pip/build step is
# silently ignored and only the final pytest exit code decides CI outcome.
set -ex

conda create -yn xpu_ao_ci python=3.10
source activate xpu_ao_ci

# Build torchao with the system toolchain; sccache is unavailable in this container.
export CC=/usr/bin/gcc
export CXX=/usr/bin/g++
export SCCACHE_DISABLE=1

python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
cd torchao && python3 setup.py install && cd ..

# Smoke-test that torch and torchao import cleanly before running the suite.
python3 -c "import torch; import torchao; print(f'Torch version: {torch.__version__}')"

pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py

.github/workflows/xpu_test.yml

Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: xpu-test

on:
  push:
    tags:
      - ciflow/xpu/*

permissions:
  id-token: write
  contents: read

concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: linux.idc.xpu
    env:
      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
      PYTORCH_RETRY_TEST_CASES: 1
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      # pytorch/pytorch is needed only for its reusable CI actions / docker tooling.
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          ref: nightly
          path: pytorch
          fetch-depth: 1
          submodules: false

      - name: Checkout Torchao (ao)
        uses: actions/checkout@v4
        with:
          repository: ${{ github.repository }}
          ref: ${{ github.head_ref || github.ref }}
          path: torchao
          fetch-depth: 1
          submodules: recursive

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check system info
        if: always()
        shell: bash
        run: |
          cat /etc/os-release || true
          cat /etc/apt/sources.list.d/oneAPI.list || true
          cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
          whoami

      - name: Runner health check xpu-smi
        if: always()
        shell: bash
        run: |
          timeout 30 xpu-smi discovery || true

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
          msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

      - name: Runner health check disconnect on failure
        if: ${{ failure() }}
        shell: bash
        run: |
          killall runsvc.sh

      - name: Preserve github env variables for use in docker
        shell: bash
        run: |
          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

      - name: XPU set GPU_FLAG
        shell: bash
        run: |
          # Add render group for container creation.
          render_gid=`cat /etc/group | grep render | cut -d: -f3`
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

      - name: configure aws credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ env.DOCKER_IMAGE }}
          working-directory: pytorch
          repo-name: pytorch

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*:}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      # Second GPU probe via clinfo; renamed so the two health-check steps are
      # distinguishable in the job log (previously a duplicate step name).
      - name: Runner health check GPU count (clinfo)
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Test
        id: test
        env:
          TEST_COMMAND: torchao/.github/scripts/ci_test_xpu.sh
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x

          # detached container should get cleaned up by teardown_ec2_linux
          # Used for GPU_FLAG since that doesn't play nice
          # NOTE: no --name flag here — it previously referenced ${container_name}
          # inside its own command substitution, where the variable is still
          # empty; the container ID captured from stdout is used instead.
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on same IDC node
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          # This job defines no matrix, so the previous
          # matrix.config/shard/num_shards/runner name expanded to "coredumps----".
          name: coredumps-${{ github.job }}-${{ github.run_id }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

0 commit comments

Comments
 (0)