# CI workflow: build PyTorch/torchcomms with the RCCL backend on a ROCm runner
# and run the RCCL integration test suite. Delegates container setup to the
# pytorch/test-infra reusable linux_job_v2 workflow.
name: Build RCCL

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: RCCL
            runs-on: amd-mi350-runner
            # NOTE(review): python-version was referenced by the script but never
            # defined in the matrix (expanded to empty). Pinning 3.10 — confirm
            # the intended interpreter version.
            python-version: "3.10"
            gpu-arch-type: "rocm"
            gpu-arch-version: "7.0"
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
            docker-image: pytorch/manylinux2_28-builder:rocm7.0

    # permissions must be a job-level key to take effect; inside a matrix
    # include entry it is inert data. id-token is needed for OIDC auth in the
    # reusable workflow.
    permissions:
      id-token: write
      contents: read
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 180
      runner: ${{ matrix.runs-on }}
      # Pass the matrix-selected build container (previously defined but unused).
      docker-image: ${{ matrix.docker-image }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      script: |
        set -ex
        # use faster libmamba solver
        conda config --set solver libmamba

        # TODO: remove dependency on fbwhoami
        echo "DEVICE_NAME=$(hostname)" > /etc/fbwhoami
        cat /etc/fbwhoami

        conda create -n venv python=${{ matrix.python-version }} -y
        conda activate venv
        python -m pip install --upgrade pip
        conda install conda-forge::libopenssl-static conda-forge::rsync -y

        # Install the nightly torch wheel matching the matrix entry. The spec
        # was previously hard-coded to the rocm6.4 index, diverging from the
        # rocm7.0 matrix configuration.
        pip install ${{ matrix.torch-spec }}
        export BUILD_RCCL_ONLY=1
        export ROCM_HOME=/opt/rocm
        export RCCL_INCLUDE=$ROCM_HOME/include/rccl
        export USE_SYSTEM_LIBS=1
        export USE_RCCL=1
        ./build_rccl.sh
        pip install numpy
        USE_NCCL=0 USE_NCCLX=0 USE_GLOO=0 USE_RCCL=1 pip install -v .

        # Verify installation
        python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
        python -c "import torch; print(f'HIP available: {torch.cuda.is_available()}')"
        python -c "import torchcomms; print('TorchComms imported successfully')"

        # Test RCCL backend availability. torch must be imported explicitly:
        # the snippet uses torch.device() and previously raised NameError.
        python -c "
        import torch
        import torchcomms
        try:
            comm = torchcomms.new_comm('rccl', torch.device('hip'), 'test_comm')
            print('RCCL backend available')
        except Exception as e:
            print(f'RCCL backend test failed: {e}')
        "

        # Run integration tests
        echo "Running RCCL integration tests..."
        comms/torchcomms/scripts/run_tests_integration_rccl_py.sh