Skip to content

Commit 93524f4

Browse files
dmwufacebook-github-bot
authored and committed
Add build_test file for RCCL (#23)
Summary: As in title. Also delete the missed setup_rccl.sh copy in GitHub (previous diff). Differential Revision: D84125652. Pulled By: sudharssun
1 parent f9a45be commit 93524f4

File tree

2 files changed

+73
-1
lines changed

2 files changed

+73
-1
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
# Builds torchcomms against RCCL on an AMD GPU runner, smoke-tests the
# installed package, and runs the RCCL integration test script.
# Triggered on pushes to main and on every pull request.
name: Build RCCL

on:
  push:
    branches:
      - main
  pull_request:

permissions:
  id-token: write
  contents: read

jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: RCCL
            runs-on: amd-mi350-runner
            # Consumed by `conda create` in the script below. The matrix
            # previously had no such key, so the expression expanded to an
            # empty string and conda was invoked with a bare `python=`.
            # NOTE(review): confirm 3.10 is the interpreter this project targets.
            python-version: "3.10"
            gpu-arch-type: "rocm"
            gpu-arch-version: "7.0"
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
            # NOTE(review): declared here but not forwarded to the reusable
            # workflow's `with:` inputs below — confirm whether that is intended.
            docker-image: pytorch/manylinux2_28-builder:rocm7.0
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 180
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      script: |
        set -ex
        # use faster libmamba solver
        conda config --set solver libmamba

        # TODO: remove dependency on fbwhoami
        echo "DEVICE_NAME=$(hostname)" > /etc/fbwhoami
        cat /etc/fbwhoami

        conda create -n venv python=${{ matrix.python-version }} -y
        conda activate venv
        python -m pip install --upgrade pip
        conda install conda-forge::libopenssl-static conda-forge::rsync -y

        # Install torch from the matrix-declared spec so the wheel's ROCm
        # version stays in step with gpu-arch-version (previously a
        # hard-coded rocm6.4 index was used while the matrix said rocm7.0).
        # Left unquoted on purpose: the spec expands to several pip arguments.
        pip install ${{ matrix.torch-spec }}

        # Environment consumed by build_rccl.sh and the pip build below.
        export BUILD_RCCL_ONLY=1
        export ROCM_HOME=/opt/rocm
        export RCCL_INCLUDE=$ROCM_HOME/include/rccl
        export USE_SYSTEM_LIBS=1
        export USE_RCCL=1
        ./build_rccl.sh
        pip install numpy
        USE_NCCL=0 USE_NCCLX=0 USE_GLOO=0 USE_RCCL=1 pip install -v .

        # Verify installation
        python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
        python -c "import torch; print(f'HIP available: {torch.cuda.is_available()}')"
        python -c "import torchcomms; print('TorchComms imported successfully')"

        # Test RCCL backend availability.
        # Best-effort: a failure is printed but does not fail the job; the
        # integration tests below are the gating step.
        # `torch` must be imported here: without it, torch.device raised a
        # NameError that the except clause reported as a backend failure.
        python -c "
        import torch
        import torchcomms
        try:
            comm = torchcomms.new_comm('rccl', torch.device('hip'), 'test_comm')
            print('RCCL backend available')
        except Exception as e:
            print(f'RCCL backend test failed: {e}')
        "

        # Run integration tests
        echo "Running RCCL integration tests..."
        comms/torchcomms/scripts/run_tests_integration_rccl_py.sh

build_rccl.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,5 @@ else
7575
export RCCL_HOME=$ROCM_HOME/lib
7676
fi
7777

78-
popd || exit 1
78+
popd || true
7979
pip install numpy

0 commit comments

Comments
 (0)