-
Notifications
You must be signed in to change notification settings - Fork 3.4k
177 lines (152 loc) · 6.1 KB
/
tpu-tests.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# Runs the Fabric and PyTorch test suites on Google Cloud TPU VMs.
name: Test PyTorch - TPU

on:
  push:
    branches: [master, "release/*"]
  # NOTE(review): `pull_request_target` workflows run in the context of the
  # base repository; confirm that exposing this job to fork PRs is intended.
  pull_request_target:
    branches: [master, "release/*"]
    types:
      - opened
      - reopened
      - edited
      - ready_for_review
      - synchronize

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  # For `pull_request_target` events `github.ref` is the *base* branch
  # (e.g. refs/heads/master), so a ref-based check would never cancel
  # superseded PR runs. Pushes are limited to master/release branches by the
  # trigger above, so "not a push" identifies exactly the PR runs.
  cancel-in-progress: ${{ github.event_name != 'push' }}

defaults:
  run:
    shell: bash

jobs:
  test-on-tpus:
    runs-on: ubuntu-22.04
    # Gate: pushes to master always run; pull-request events run only when
    # the PR title contains 'TPU'.
    if: |
      (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
      (startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.title, 'TPU'))
    timeout-minutes: 30
    strategy:
      # let the remaining matrix entries finish when one of them fails
      fail-fast: false
      matrix:
        pkg-name:
          - "fabric"
          - "pytorch"
        runtime:
          - "xrt"
          - "pjrt"
        accelerator_type:
          - "v4-8"
    env:
      XLA_VER: "2.0"
      # PR number for pull-request events, the literal 'master' otherwise
      PR_NUMBER: ${{ github.event.pull_request.number || 'master' }}
      # PR head commit for pull-request events, the pushed commit otherwise
      SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      CLOUDSDK_CORE_DISABLE_PROMPTS: 1  # default to --quiet
    steps:
- name: Set env
  run: |
    # pick the gcloud --zone for the accelerator generation:
    # https://cloud.google.com/tpu/docs/regions-zones
    case "${{ matrix.accelerator_type }}" in
      v4*) zone="us-central2-b" ;;
      *) zone="us-west4-a" ;;
    esac
    # exported through GITHUB_ENV so every later step sees the zone
    echo "CLOUDSDK_COMPUTE_ZONE=${zone}" >> "$GITHUB_ENV"
# NOTE(review): this workflow is triggered by `pull_request_target` and checks
# out the PR head below, so contributor-supplied code is handled by a job that
# also receives repository secrets - confirm the PR-title gate is sufficient.
- uses: actions/checkout@v4
  with:
    # PR head commit; the expression is empty on push events (no PR payload)
    ref: ${{ github.event.pull_request.head.sha }}
- uses: actions/setup-python@v4
  with:
    # quoted so YAML does not read the version as the float 3.1
    python-version: "3.10"
# authenticate against Google Cloud with the service-account key secret
- uses: google-github-actions/auth@v1
  with:
    credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}
- uses: google-github-actions/setup-gcloud@v1
- name: Time-based job cleanup
  if: always()
  run: |
    # Delete every TPU VM older than the timeout below, except VMs whose
    # name contains "keepalive".
    max_age_seconds=$((35 * 60))
    gcloud compute tpus tpu-vm list --format='value(name,createTime)' > creation_times.txt
    cat creation_times.txt
    if [ ! -s "creation_times.txt" ]; then
      echo "No existing jobs"
      exit 0
    fi
    deleted_any=false
    # each line holds: <name> <creation time>
    while read -r job_name creation_time; do
      case "$job_name" in
        *keepalive*)
          echo "Skipping $job_name, has keepalive in name"
          continue
          ;;
      esac
      # age in seconds = now - creation time (both as Unix timestamps)
      created_epoch=$(date -d "${creation_time}" +%s)
      now_epoch=$(date +%s)
      age=$((now_epoch - created_epoch))
      if ((age <= max_age_seconds)); then
        echo "Skipping $job_name, alive for $age seconds"
        continue
      fi
      # past the timeout: request deletion without waiting for it to finish
      gcloud compute tpus tpu-vm delete "$job_name" --async
      deleted_any=true
    done < creation_times.txt
    if [ "$deleted_any" = true ]; then
      sleep 5
      # diagnostics: show what is left after the deletions were requested
      gcloud compute tpus tpu-vm list
    fi
- name: Update script
  shell: python
  run: |
    # Fill the placeholders of the package's TPU test script in place.
    import os

    # placeholder -> value written into the script
    substitutions = {
        '{PYTORCH_VERSION}': os.environ["XLA_VER"],
        '{RUNTIME}': '${{ matrix.runtime }}',
    }
    script_path = f'tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh'
    with open(script_path) as source:
        content = source.read()
    for placeholder, value in substitutions.items():
        content = content.replace(placeholder, value)
    # echo the final script into the job log
    print(content)
    with open(script_path, "w") as target:
        target.write(content)
- name: Create node
  id: tpu-create
  # TPU capacity is very limited so this workflow's success is optional. continue normally if creation fails
  continue-on-error: true
  env:
    JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.runtime }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
  run: |
    # Only the v4 runtime image is handled here. Fail explicitly for any
    # other accelerator type: silently skipping creation would leave this
    # step "successful" and let "Run tests" target a VM that does not exist.
    if [[ "${{ matrix.accelerator_type }}" != v4* ]]; then
      echo "Unsupported accelerator type: ${{ matrix.accelerator_type }}" >&2
      exit 1
    fi
    # preemptible TPU VM named after the PR, matrix entry and commit
    gcloud compute tpus tpu-vm create "$JOB_NAME" \
      --accelerator-type="${{ matrix.accelerator_type }}" \
      --version="tpu-vm-v4-pt-$XLA_VER" \
      --preemptible
- name: Run tests
  # only when the TPU VM was actually created
  if: steps.tpu-create.outcome == 'success'
  env:
    JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.runtime }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
  run: |
    set -uex
    # zip-copy-unzip the repository; exclude both the .git directory entry
    # and everything below it (a bare ".git/" pattern only matches the entry)
    zip -q -r repo.zip . -x ".git/" ".git/*"
    gcloud compute tpus tpu-vm scp --worker=all repo.zip "$JOB_NAME":~
    gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; unzip -q -o repo.zip"
    # run script; capture its status with `||` so `set -e` does not abort
    # before the coverage file is pulled
    exit_code=0
    gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; bash tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh" || exit_code=$?
    # pull out the coverage file
    if ! gcloud compute tpus tpu-vm scp "$JOB_NAME":~/coverage.xml .; then
      echo "Failed to retrieve coverage.xml" >&2
      # a missing coverage file still fails the step, but must not replace
      # the exit status of a failed test run
      if [[ "$exit_code" -eq 0 ]]; then
        exit_code=1
      fi
    fi
    exit "$exit_code"
- name: Cleanup job
  # runs regardless of the outcome of the previous steps
  if: always()
  env:
    JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.runtime }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
  run: |
    # Capture the listing before searching it: with bash's `pipefail`,
    # `gcloud ... | grep -q` can report failure even on a match (grep exits
    # early while gcloud is still writing), which would be misread as
    # "wasn't created" and leave the TPU VM running.
    # A failing `list` is still treated as "nothing to clean up".
    existing_jobs=$(gcloud compute tpus tpu-vm list || true)
    # -F: match the job name literally instead of as a regular expression
    if ! grep -qF -- "$JOB_NAME" <<< "$existing_jobs"; then
      echo "$JOB_NAME wasn't created"
      exit 0
    fi
    # diagnostics
    gcloud compute tpus tpu-vm describe "$JOB_NAME"
    # delete the job
    gcloud compute tpus tpu-vm delete "$JOB_NAME" --async
    sleep 5
    # diagnostics
    gcloud compute tpus tpu-vm list
- name: Upload coverage to Codecov
  uses: codecov/codecov-action@v3
  # a failed upload must not fail the job (see also fail_ci_if_error below)
  continue-on-error: true
  with:
    name: TPU-coverage
    # report pulled from the TPU VM by the "Run tests" step
    file: coverage.xml
    flags: tpu,pytest
    token: ${{ secrets.CODECOV_TOKEN }}
    fail_ci_if_error: false