Skip to content

Commit d2912f5

Browse files
committed
Add rocm perf yml file
1 parent 3c4fd69 commit d2912f5

File tree

1 file changed

+128
-0
lines changed

1 file changed

+128
-0
lines changed

.github/workflows/rocm-perf.yml

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
# Builds a ROCm JAX Docker image from this repository, runs a fixed set of
# MaxText training configs inside it, and publishes the per-model logs plus a
# median-step-time summary as a workflow artifact.
name: ROCm DLM Performance Evaluations

on:
  push:

jobs:
  build-and-test-jax:
    # Custom runner label. The steps below assume the host provides Docker and
    # the ROCm device nodes (/dev/kfd, /dev/dri) that are passed to the
    # container.
    runs-on: mi-250
    strategy:
      matrix:
        # Quoted so the versions stay strings ("3.10" must not parse as 3.1).
        python: ["3.10"]
        rocm: ["6.3.4"]

    env:
      BASE_IMAGE: "ubuntu:22.04"
      PYTHON_VERSION: ${{ matrix.python }}
      ROCM_VERSION: ${{ matrix.rocm }}
      # Image tag and checkout directory are unique per run and attempt.
      TEST_IMAGE: ubuntu-jax-${{ github.run_id }}_${{ github.run_number }}_${{ github.run_attempt }}
      WORKSPACE_DIR: jax_rocm_perf_${{ github.run_id }}_${{ github.run_number }}_${{ github.run_attempt }}

    steps:
      - name: Clean up old workdirs
        run: |
          # Re-own leftover workdirs from inside a container before deleting
          # them (presumably earlier containers left files the runner user
          # cannot remove). --rm keeps this helper container from piling up.
          docker run --rm -v "$(pwd):/jax" ubuntu bash -c "chown -R $UID /jax/jax_rocm_perf_* || true"
          rm -rf jax_rocm_perf_* || true
          # Results are written to the workspace root; drop any left behind by
          # an earlier run so they are not analyzed or uploaded as this run's.
          rm -f logs_*.log summary.json analyze_logs.py
          ls -l

      - name: Print system info
        run: |
          whoami
          printenv
          df -h
          rocm-smi || true

      - name: Checkout JAX source
        uses: actions/checkout@v4
        with:
          path: ${{ env.WORKSPACE_DIR }}

      - name: Build JAX Docker Image
        run: |
          cd "$WORKSPACE_DIR"
          python3 build/rocm/ci_build \
            --rocm-version "$ROCM_VERSION" \
            --base-docker "$BASE_IMAGE" \
            --python-versions "$PYTHON_VERSION" \
            --compiler=clang \
            dist_docker \
            --image-tag "$TEST_IMAGE"

      - name: Checkout MaxText source
        uses: actions/checkout@v4
        with:
          repository: ROCm/maxtext
          ref: rv_jax
          path: ${{ env.WORKSPACE_DIR }}/maxtext

      - name: Launch container
        run: |
          # The container name is fixed, so a container left over from a run
          # that died before "Cleanup container" would make `docker run` fail.
          docker rm -f maxtext_container 2>/dev/null || true
          # Keep the container idle in the background; later steps use
          # `docker exec` to run commands in it.
          docker run -d --name maxtext_container \
            --network=host \
            --device=/dev/kfd \
            --device=/dev/dri \
            --ipc=host \
            --shm-size=64G \
            --group-add=video \
            --cap-add=SYS_PTRACE \
            --security-opt seccomp=unconfined \
            -v "$(pwd)/${WORKSPACE_DIR}/maxtext:/maxtext" \
            -w /maxtext \
            "$TEST_IMAGE" \
            tail -f /dev/null

      - name: Run MaxText training and save logs
        run: |
          # Without pipefail the pipeline's status is that of `tee`, which
          # would hide a failed training run.
          set -o pipefail
          docker exec maxtext_container bash -c "pip install -r requirements.txt"
          for config in \
            MaxText/configs/models/gpu/llama2_7b_rocm.yml \
            MaxText/configs/models/gpu/gemma_2b_rocm.yml \
            MaxText/configs/models/gpu/gpt3_6b_rocm.yml \
            MaxText/configs/models/gpu/mixtral_8x1b_rocm.yml; do
            # e.g. ".../llama2_7b_rocm.yml" -> "llama2_7b"
            model_name=$(basename "$config" _rocm.yml)
            echo "Running $model_name"
            docker exec maxtext_container bash -c "python3 -m MaxText.train $config" \
              | tee "logs_${model_name}.log"
          done

      - name: Analyze logs to compute median step time
        run: |
          pip install numpy
          # Quoted delimiter: the script is written verbatim, with no shell
          # expansion of its contents.
          cat << 'EOF' > analyze_logs.py
          import json, re, glob
          import numpy as np

          summary = {}
          for log in glob.glob("logs_*.log"):
              model = log.replace("logs_", "").replace(".log", "")
              times = []
              with open(log) as f:
                  for line in f:
                      m = re.search(r"completed step: \d+, seconds: ([\d.]+)", line)
                      if m:
                          times.append(float(m.group(1)))
              if times:
                  summary[model] = {
                      "median_step_time": round(float(np.median(times)), 3),
                      "steps_counted": len(times)
                  }

          with open("summary.json", "w") as f:
              json.dump(summary, f, indent=2)
          EOF
          python3 analyze_logs.py
          cat summary.json

      - name: Upload logs and summary
        # Keep whatever logs exist even when training or analysis failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: training-results
          path: |
            logs_*.log
            summary.json

      - name: Cleanup container
        if: always()
        run: |
          docker stop maxtext_container || true
          docker rm maxtext_container || true

0 commit comments

Comments
 (0)