# pythonapp-gpu.yml
# Jenkinsfile.monai-premerge
name: premerge-gpu

on:
  # quick tests for pull requests and the releasing branches
  push:
    branches:
      - main
      - releasing/*
  pull_request:
    types: [opened, synchronize, closed]
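      # 'closed' pull request events also trigger this workflow; the per-step
      # "merged != true" conditions below decide whether anything actually runs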

concurrency:
  # automatically cancel the previously triggered workflows when there's a newer version
  group: build-gpu-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  GPU-quick-py3: # GPU with full dependencies
    # if: ${{ github.repository == 'Project-MONAI/MONAI' && github.event.pull_request.merged != true }}
    if: ${{ false }} # disable self-hosted job project-monai/monai#7039
    strategy:
      matrix:
        environment:
          - "PT19+CUDA114DOCKER"
          - "PT110+CUDA111"
          - "PT112+CUDA118DOCKER"
          - "PT113+CUDA116"
          - "PT210+CUDA121DOCKER"
        include:
          # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes
          - environment: PT19+CUDA114DOCKER
            # 21.10: 1.10.0a0+0aef44c
            pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error
            base: "nvcr.io/nvidia/pytorch:21.10-py3"
          - environment: PT110+CUDA111
            pytorch: "torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu111"
            base: "nvcr.io/nvidia/cuda:11.1.1-devel-ubuntu18.04"
          - environment: PT112+CUDA118DOCKER
            # 22.09: 1.13.0a0+d0d6b1f
            pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error
            base: "nvcr.io/nvidia/pytorch:22.09-py3"
          - environment: PT113+CUDA116
            pytorch: "torch==1.13.1 torchvision==0.14.1"
            base: "nvcr.io/nvidia/cuda:11.6.1-devel-ubuntu18.04"
          - environment: PT210+CUDA121DOCKER
            # 23.08: 2.1.0a0+29c30b1
            pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error
            base: "nvcr.io/nvidia/pytorch:23.08-py3"
    container:
      image: ${{ matrix.base }}
      options: --gpus all --env NVIDIA_DISABLE_REQUIRE=true # workaround for unsatisfied condition: cuda>=11.6
    runs-on: [self-hosted, linux, x64, common]
    steps:
      - uses: actions/checkout@v4
      - name: apt install
        if: github.event.pull_request.merged != true
        run: |
          apt-get update
          apt-get install -y wget

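          # the plain CUDA base images ship without a Python toolchain; install python3.8,
          # pip and the build dependencies there (the NGC pytorch images already include them)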
          if [ ${{ matrix.environment }} = "PT110+CUDA111" ] || \
            [ ${{ matrix.environment }} = "PT113+CUDA116" ]
          then
          PYVER=3.8 PYSFX=3 DISTUTILS=python3-distutils && \
          apt-get update && apt-get install -y --no-install-recommends \
            curl \
            pkg-config \
            python$PYVER \
            python$PYVER-dev \
            python$PYSFX-pip \
            $DISTUTILS \
            rsync \
            swig \
            unzip \
            zip \
            zlib1g-dev \
            libboost-locale-dev \
            libboost-program-options-dev \
            libboost-system-dev \
            libboost-thread-dev \
            libboost-test-dev \
            libgoogle-glog-dev \
            libjsoncpp-dev \
            cmake \
            git && \
          rm -rf /var/lib/apt/lists/* && \
          export PYTHONIOENCODING=utf-8 LC_ALL=C.UTF-8 && \
          rm -f /usr/bin/python && \
          rm -f /usr/bin/python`echo $PYVER | cut -c1-1` && \
          ln -s /usr/bin/python$PYVER /usr/bin/python && \
          ln -s /usr/bin/python$PYVER /usr/bin/python`echo $PYVER | cut -c1-1` &&
          curl -O https://bootstrap.pypa.io/get-pip.py && \
          python get-pip.py && \
          rm get-pip.py;
          fi
      - if: matrix.environment == 'PT19+CUDA114DOCKER'
        name: Optional Cupy dependency (cuda114)
        run: echo "cupy-cuda114" >> requirements-dev.txt
      - name: Install dependencies
        if: github.event.pull_request.merged != true
        run: |
          which python
          python -m pip install --upgrade pip wheel
          # fixes preinstalled ruamel_yaml error from the docker image
          rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel*
          rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/llvmlite* #6377
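          # for the DOCKER environments matrix.pytorch is "-h", so the next line only
          # prints pip's usage text and keeps the torch build preinstalled in the NGC image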
          python -m pip install ${{ matrix.pytorch }}
          python -m pip install -r requirements-dev.txt
          python -m pip list
      - name: Run quick tests (GPU)
        if: github.event.pull_request.merged != true
        run: |
          git clone --depth 1 \
            https://github.com/Project-MONAI/MONAI-extra-test-data.git /MONAI-extra-test-data
          export MONAI_EXTRA_TEST_DATA="/MONAI-extra-test-data"
          nvidia-smi
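          # random 0-290s delay (numpy.random.randint(30) * 10), presumably to stagger
          # jobs that land on the shared self-hosted runner at the same time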
          export LAUNCH_DELAY=$(python -c "import numpy; print(numpy.random.randint(30) * 10)")
          echo "Sleep $LAUNCH_DELAY"
          sleep $LAUNCH_DELAY
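          # the last line printed by `python -m tests.utils` (run under coverage) is used
          # as the set of GPUs visible to the test run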
          export CUDA_VISIBLE_DEVICES=$(coverage run -m tests.utils | tail -n 1)
          echo $CUDA_VISIBLE_DEVICES
          trap 'if pgrep python; then pkill python; fi;' ERR
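          # background process that keeps touching cuda:0 and cuda:1 for the duration of the
          # step; it is cleaned up by the ERR trap above and the final pkill below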
          python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
          python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
          python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
          python -c "import monai; monai.config.print_config()"
          # build for the current self-hosted CI Tesla V100
          BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --build --disttests
          ./runtests.sh --quick --unittests
          if [ ${{ matrix.environment }} = "PT113+CUDA116" ]; then
            # test the clang-format tool downloading once
            coverage run -m tests.clang_format_utils
          fi
          coverage xml --ignore-errors
          if pgrep python; then pkill python; fi
        shell: bash
      - name: Upload coverage
        if: ${{ github.head_ref != 'dev' && github.event.pull_request.merged != true }}
        uses: codecov/codecov-action@v4
        with:
          files: ./coverage.xml