# CI workflow: build PyTorch/torchcomms with the RCCL backend on a ROCm runner
# and run the RCCL integration test suite. Delegates container setup to the
# pytorch/test-infra reusable linux_job_v2 workflow.
name: Build RCCL

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: RCCL
            runs-on: amd-mi350-runner
            # NOTE(review): python-version was referenced by the script but never
            # defined in the matrix (expanded to empty). Pinning 3.10 — confirm
            # the intended interpreter version.
            python-version: "3.10"
            gpu-arch-type: "rocm"
            gpu-arch-version: "7.0"
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
            docker-image: pytorch/manylinux2_28-builder:rocm7.0

    # permissions must be a job-level key to take effect; inside a matrix
    # include entry it is inert data. id-token is needed for OIDC auth in the
    # reusable workflow.
    permissions:
      id-token: write
      contents: read
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 180
      runner: ${{ matrix.runs-on }}
      # Pass the matrix-selected build container (previously defined but unused).
      docker-image: ${{ matrix.docker-image }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      script: |
        set -ex
        # use faster libmamba solver
        conda config --set solver libmamba

        # TODO: remove dependency on fbwhoami
        echo "DEVICE_NAME=$(hostname)" > /etc/fbwhoami
        cat /etc/fbwhoami

        conda create -n venv python=${{ matrix.python-version }} -y
        conda activate venv
        python -m pip install --upgrade pip
        conda install conda-forge::libopenssl-static conda-forge::rsync -y

        # Install the nightly torch wheel matching the matrix entry. The spec
        # was previously hard-coded to the rocm6.4 index, diverging from the
        # rocm7.0 matrix configuration.
        pip install ${{ matrix.torch-spec }}
        export BUILD_RCCL_ONLY=1
        export ROCM_HOME=/opt/rocm
        export RCCL_INCLUDE=$ROCM_HOME/include/rccl
        export USE_SYSTEM_LIBS=1
        export USE_RCCL=1
        ./build_rccl.sh
        pip install numpy
        USE_NCCL=0 USE_NCCLX=0 USE_GLOO=0 USE_RCCL=1 pip install -v .

        # Verify installation
        python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
        python -c "import torch; print(f'HIP available: {torch.cuda.is_available()}')"
        python -c "import torchcomms; print('TorchComms imported successfully')"

        # Test RCCL backend availability. torch must be imported explicitly:
        # the snippet uses torch.device() and previously raised NameError.
        python -c "
        import torch
        import torchcomms
        try:
            comm = torchcomms.new_comm('rccl', torch.device('hip'), 'test_comm')
            print('RCCL backend available')
        except Exception as e:
            print(f'RCCL backend test failed: {e}')
        "

        # Run integration tests
        echo "Running RCCL integration tests..."
        comms/torchcomms/scripts/run_tests_integration_rccl_py.sh