Skip to content

Commit b7f090d

Browse files
dmwu authored and facebook-github-bot committed
Add build_test file for RCCL (#18)
Summary: As in title. Also delete the missed setup_rccl.sh copy in github (previous diff) Differential Revision: D84125652 Pulled By: sudharssun
1 parent ea81c2e commit b7f090d

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
# CI workflow: builds the project against RCCL on an AMD GPU runner via the
# reusable pytorch/test-infra linux_job_v2 workflow, then runs import smoke
# checks and the RCCL Python integration test script.
name: Build RCCL

on:
  push:
    branches:
      - main
  pull_request:

permissions:
  id-token: write
  contents: read

jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: RCCL
            # NOTE(review): these are plain matrix values, not job-level
            # permissions; nothing in this workflow reads matrix.permissions.
            permissions:
              id-token: write
              contents: read
            runs-on: amd-mi350-runner
            gpu-arch-type: "rocm"
            gpu-arch-version: "7.0"
            # Read by `conda create` in the script below. It was previously
            # undefined, so the version spec expanded to an empty string.
            # NOTE(review): 3.10 is an assumed default; confirm the version
            # this project targets.
            python-version: "3.10"
            # Arguments passed to `pip install` for torch in the script below.
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
            # NOTE(review): docker-image is defined but not forwarded in the
            # `with:` section; confirm whether it should be passed through.
            docker-image: pytorch/manylinux2_28-builder:rocm7.0
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 180
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      script: |
        set -ex
        # use faster libmamba solver
        conda config --set solver libmamba

        # TODO: remove dependency on fbwhoami
        echo "DEVICE_NAME=$(hostname)" > /etc/fbwhoami
        cat /etc/fbwhoami

        conda create -n venv python=${{ matrix.python-version }} -y
        conda activate venv
        python -m pip install --upgrade pip
        conda install conda-forge::libopenssl-static conda-forge::rsync -y

        # Install torch from the matrix torch-spec so the wheel's ROCm version
        # stays in sync with gpu-arch-version (previously a hard-coded rocm6.4
        # index was used here while the matrix declared rocm7.0).
        pip install ${{ matrix.torch-spec }}
        export BUILD_RCCL_ONLY=1
        export ROCM_HOME=/opt/rocm
        export RCCL_INCLUDE=$ROCM_HOME/include/rccl
        export USE_SYSTEM_LIBS=1
        export USE_RCCL=1
        ./build_rccl.sh
        pip install numpy
        USE_NCCL=0 USE_NCCLX=0 USE_GLOO=0 USE_RCCL=1 pip install -v .

        # Verify installation
        python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
        python -c "import torch; print(f'HIP available: {torch.cuda.is_available()}')"
        python -c "import torchcomms; print('TorchComms imported successfully')"

        # Test RCCL backend availability.
        # Best-effort: a failure is printed but does not fail the job.
        # torch must be imported here; without it torch.device raised
        # NameError and the check always reported a failure.
        # NOTE(review): confirm 'hip' is the intended device string for
        # this backend.
        python -c "
        import torch
        import torchcomms
        try:
            comm = torchcomms.new_comm('rccl', torch.device('hip'), 'test_comm')
            print('RCCL backend available')
        except Exception as e:
            print(f'RCCL backend test failed: {e}')
        "

        # Run integration tests
        echo "Running RCCL integration tests..."
        comms/torchcomms/scripts/run_tests_integration_rccl_py.sh

0 commit comments

Comments
 (0)