Skip to content

Commit 5e84173

Browse files
dmwufacebook-github-bot
authored andcommitted
Add build_test file for RCCL (#23)
Summary: As in title. Also delete the missed setup_rccl.sh copy in github (previous diff) Differential Revision: D84125652 Pulled By: sudharssun
1 parent 5c22a0a commit 5e84173

File tree

2 files changed

+84
-1
lines changed

2 files changed

+84
-1
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
name: Build RCCL
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
pull_request:
8+
9+
permissions:
10+
id-token: write
11+
contents: read
12+
13+
jobs:
14+
build:
15+
strategy:
16+
fail-fast: false
17+
matrix:
18+
include:
19+
- name: RCCL
20+
runs-on: amd-mi350-runner
21+
gpu-arch-type: "rocm"
22+
gpu-arch-version: "7.0"
23+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
24+
docker-image: pytorch/manylinux2_28-builder:rocm7.0
25+
cmake-version: "latest"
26+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
27+
with:
28+
timeout: 180
29+
runner: ${{ matrix.runs-on }}
30+
gpu-arch-type: ${{ matrix.gpu-arch-type }}
31+
gpu-arch-version: ${{ matrix.gpu-arch-version }}
32+
script: |
33+
set -ex
34+
# use faster libmamba solver
35+
conda config --set solver libmamba
36+
37+
# TODO: remove dependency on fbwhoami
38+
echo "DEVICE_NAME=$(hostname)" > /etc/fbwhoami
39+
cat /etc/fbwhoami
40+
41+
conda create -n venv python=${{ matrix.python-version }} -y
42+
conda activate venv
43+
python -m pip install --upgrade pip
44+
conda install conda-forge::libopenssl-static conda-forge::rsync -y
45+
conda install conda-forge::glog=0.4.0 conda-forge::gflags conda-forge::fmt -y
46+
47+
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0
48+
49+
if [ "${{ matrix.cmake-version }}" = "latest" ]; then
50+
conda install -y cmake
51+
else # default to latest
52+
conda install -y cmake==${{ matrix.cmake-version }}
53+
fi
54+
55+
pip install -r requirements.txt
56+
57+
export BUILD_RCCL_ONLY=1
58+
export ROCM_HOME=/opt/rocm
59+
export RCCL_INCLUDE=$ROCM_HOME/include/rccl
60+
export USE_SYSTEM_LIBS=1
61+
export USE_RCCL=1
62+
./build_rccl.sh
63+
pip install numpy
64+
USE_NCCL=0 USE_NCCLX=0 USE_GLOO=0 USE_RCCL=1 pip install --no-build-isolation -v -e .
65+
66+
# Verify installation
67+
python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
68+
python -c "import torch; print(f'HIP available: {torch.cuda.is_available()}')"
69+
python -c "import torchcomms; print('TorchComms imported successfully')"
70+
71+
# Test RCCL backend availability
72+
python -c "
73+
import torchcomms
74+
try:
75+
comm = torchcomms.new_comm('rccl', torch.device('hip'), 'test_comm')
76+
print('RCCL backend available')
77+
except Exception as e:
78+
print(f'RCCL backend test failed: {e}')
79+
"
80+
81+
# Run integration tests
82+
echo "Running RCCL integration tests..."
83+
comms/torchcomms/scripts/run_tests_integration_rccl_py.sh

build_rccl.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,5 @@ else
7575
export RCCL_HOME=$ROCM_HOME/lib
7676
fi
7777

78-
popd || exit 1
78+
popd || true
7979
pip install numpy

0 commit comments

Comments
 (0)