ci: switch to custom docker images (#2123)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Lightning-AI · Oct 2, 2023 · 886c09b · 886c09b
1 parent 39ba91a
commit 886c09b
Show file tree

Hide file tree

Showing 16 changed files with 243 additions and 88 deletions.
diff --git a/.azure/gpu-integrations.yml b/.azure/gpu-integrations.yml
@@ -65,22 +65,22 @@ jobs:
           set -e
           pip install -q packaging fire requests wget
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
-          python adjust-torch-versions.py requirements.txt $(torch-ver)
+          python adjust-torch-versions.py requirements/base.txt $(torch-ver)
           python adjust-torch-versions.py requirements/integrate.txt $(torch-ver)
+          # FixMe: this shall not be for all integrations/cases
           python .github/assistant.py set-oldest-versions --req_files='["requirements/integrate.txt"]'
           cat requirements/integrate.txt
         displayName: "Adjust versions"
 
       - bash: |
-          set -ex
           pip install -q -r requirements/integrate.txt
           # force reinstall TM as it could be overwritten by integration's dependencies
           pip install . -U -r requirements/test.txt --find-links ${TORCH_URL}
-          pip list
         displayName: "Install package & integrations"
 
       - bash: |
           set -e
+          pip list
           python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$(torch-ver)', f'PyTorch: {ver}'"
           python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'found GPUs: {mgpu}'"
         displayName: "Sanity check"

diff --git a/.azure/gpu-unittests.yml b/.azure/gpu-unittests.yml
@@ -19,17 +19,17 @@ jobs:
       matrix:
         "PyTorch | old":
           # Torch does not have build wheels with old Torch versions for newer CUDA
-          docker-image: "nvidia/cuda:11.1.1-cudnn8-devel-ubuntu20.04"
+          docker-image: "pytorchlightning/torchmetrics:ubuntu20.04-cuda11.1.1-py3.8-torch1.8.1"
           agent-pool: "lit-rtx-3090"
           torch-ver: "1.8.1"
         "PyTorch | 1.X":
-          docker-image: "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
+          docker-image: "pytorchlightning/torchmetrics:ubuntu22.04-cuda11.8.0-py3.9-torch1.13"
           agent-pool: "lit-rtx-3090"
           torch-ver: "1.13.1"
         "PyTorch | 2.X":
-          docker-image: "pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime"
+          docker-image: "pytorchlightning/torchmetrics:ubuntu22.04-cuda11.8.0-py3.10-torch2.0"
           agent-pool: "lit-rtx-3090"
-          torch-ver: "2.0.0"
+          torch-ver: "2.0.1"
     # how long to run the job before automatically cancelling
     timeoutInMinutes: "120"
     # how much time to give 'run always even if cancelled tasks' before stopping them
@@ -51,37 +51,12 @@ jobs:
 
     container:
       image: "$(docker-image)"
-      options: "--gpus=all --shm-size=8g -v /usr/bin/docker:/tmp/docker:ro  -v /var/tmp:/var/tmp"
+      options: "--gpus=all --shm-size=8g -v /var/tmp:/var/tmp"
 
     workspace:
       clean: all
 
     steps:
-      - script: |
-          set -ex
-          container_id=$(head -1 /proc/self/cgroup|cut -d/ -f3)
-          echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
-          /tmp/docker exec -t -u 0 $container_id \
-            sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo"
-          echo "##vso[task.setvariable variable=CONTAINER_ID]$container_id"
-        displayName: "Install Sudo in container (thanks Microsoft!)"
-
-      - script: |
-          sudo apt-get update -q --fix-missing
-          sudo apt-get install -q -y --no-install-recommends \
-            build-essential \
-            wget \
-            python${PYTHON_VERSION} \
-            python${PYTHON_VERSION}-dev \
-            python${PYTHON_VERSION}-distutils
-          sudo update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1
-          wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate
-          python get-pip.py
-        env:
-          PYTHON_VERSION: "3.8"
-        condition: startsWith(variables['docker-image'], 'nvidia/cuda:')
-        displayName: "install python & pip"
-
       - bash: |
           echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
           CUDA_version=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p')
@@ -96,7 +71,6 @@ jobs:
           whereis nvidia
           nvidia-smi
           echo $CUDA_VISIBLE_DEVICES
-          echo $CONTAINER_ID
           echo $TORCH_URL
           python --version
           pip --version
@@ -105,29 +79,22 @@ jobs:
         displayName: "Image info & NVIDIA"
 
       - bash: |
-          pip install -q packaging wget
-          python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
+          pip install -q packaging
+          wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
           python adjust-torch-versions.py requirements.txt $(torch-ver)
           for fpath in `ls requirements/*.txt`; do
               python adjust-torch-versions.py $fpath $(torch-ver)
           done
+        # FixMe: missing setting minimal configurations for testing
         displayName: "Adjust versions"
 
       - bash: |
-          set -ex
-          sudo apt-get update -qq --fix-missing
-          sudo apt-get install -y --no-install-recommends \
-            build-essential gcc g++ cmake ffmpeg git libsndfile1 unzip
-          # pip install pip -U
-          pip install -q "numpy<1.24"  # trying to resolve pesq installation
-          pip install . -U -r ./requirements/devel.txt \
-            --prefer-binary --find-links=${TORCH_URL}
-          pip install mkl-service==2.4.0  # needed for the gpu multiprocessing
-          pip list
+          pip install . -U -r ./requirements/devel.txt --prefer-binary --find-links=${TORCH_URL}
         displayName: "Install environment"
 
       - bash: |
           set -e
+          pip list
           python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$(torch-ver)', f'PyTorch: {ver}'"
           python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'found GPUs: {mgpu}'"
         displayName: "Sanity check"
@@ -149,8 +116,7 @@ jobs:
         displayName: "DocTesting"
 
       - bash: |
-          # wget is simpler but does not work on Windows
-          python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/metrics/data.zip', 'data.zip')"
+          wget https://pl-public-data.s3.amazonaws.com/metrics/data.zip
           unzip -o data.zip
           ls -l _data/*
         workingDirectory: tests

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -11,7 +11,6 @@ RUN if [ "${NODE_VERSION}" != "none" ]; then \
     fi
 
 COPY requirements/ /tmp/pip-tmp/requirements/
-COPY requirements.txt /tmp/pip-tmp/
 RUN \
     pip3 install awscli && \
     aws s3 sync --no-sign-request s3://sphinx-packages/ dist/  && \
@@ -23,8 +22,7 @@ RUN \
     rm -rf /tmp/pip-tmp
 
 # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image.
-# COPY requirements.txt /tmp/pip-tmp/
-# RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
+# RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements/base.txt \
 #    && rm -rf /tmp/pip-tmp
 
 # [Optional] Uncomment this section to install additional OS packages.

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -2,7 +2,7 @@
 // https://github.com/microsoft/vscode-dev-containers/tree/v0.194.0/containers/python-3
 {
   "name": "PyTorch Lightning Metrics",
-  "image": "pytorchlightning/metrics-dev",
+  "image": "pytorchlightning/torchmetrics:devcontainer-py3.9",
   // If you want to use a different Python version, uncomment the build object below
   // "build": {
   //     "dockerfile": "Dockerfile",

diff --git a/.github/actions/pull-caches/action.yml b/.github/actions/pull-caches/action.yml
@@ -26,7 +26,7 @@ runs:
       if: inputs.pytorch-version != ''
       run: |
         curl https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py -o adjust-torch-versions.py
-        python adjust-torch-versions.py requirements.txt ${{ inputs.pytorch-version }}
+        python adjust-torch-versions.py requirements/base.txt ${{ inputs.pytorch-version }}
       shell: bash
 
     - name: Set min. dependencies

diff --git a/.github/assistant.py b/.github/assistant.py
@@ -68,10 +68,10 @@ def prune_packages(req_file: str, *pkgs: str) -> None:
             fp.writelines(lines)
 
     @staticmethod
-    def set_min_torch_by_python(fpath: str = "requirements.txt") -> None:
+    def set_min_torch_by_python(fpath: str = "requirements/base.txt") -> None:
         """Set minimal torch version according to Python actual version.
 
-        >>> AssistantCLI.set_min_torch_by_python("../requirements.txt")
+        >>> AssistantCLI.set_min_torch_by_python("../requirements/base.txt")
 
         """
         py_ver = f"{sys.version_info.major}.{sys.version_info.minor}"

diff --git a/.github/workflows/ci-integrate.yml b/.github/workflows/ci-integrate.yml
@@ -66,9 +66,9 @@ jobs:
           curl https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py -o adjust-torch-versions.py
           pip install -r requirements/test.txt -r requirements/integrate.txt \
             --find-links $PYTORCH_URL -f $PYPI_CACHE --upgrade-strategy eager
-          python adjust-torch-versions.py requirements.txt
+          python adjust-torch-versions.py requirements/base.txt
           python adjust-torch-versions.py requirements/image.txt
-          cat requirements.txt
+          cat requirements/base.txt
           pip install -e . --find-links $PYTORCH_URL -f $PYPI_CACHE
           pip list
 

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
@@ -116,7 +116,6 @@ jobs:
         run: |
           curl https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py -o adjust-torch-versions.py
           pip install -q cython  # needed for installing `pycocotools` in latest config
-          python adjust-torch-versions.py requirements.txt
           for fpath in `ls requirements/*.txt`; do
               python adjust-torch-versions.py $fpath
           done

diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
@@ -1,4 +1,4 @@
-name: "Build & Push Docker"
+name: "Build (& Push) Dockers"
 
 on: # Trigger the workflow on push or pull request, but only for the master branch
   push:
@@ -7,10 +7,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
     branches: [master]
     paths:
       - "requirements/*"
-      - ".devcontainer/*"
-      - "environment.yml"
       - "requirements.txt"
-      - ".github/workflows/*docker*.yml"
+      - ".devcontainer/*"
+      - "dockers/**"
+      - ".github/workflows/docker-build.yml"
       - "setup.py"
   workflow_dispatch: {}
 
@@ -19,21 +19,21 @@ concurrency:
   cancel-in-progress: ${{ github.ref != 'refs/heads/master' }}
 
 env:
-  PUSH_RELEASE: ${{ github.ref == 'refs/heads/master' || github.event_name == 'workflow_dispatch' }}
+  PUSH_DOCKERHUB: ${{ github.ref == 'refs/heads/master' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }}
 
 jobs:
   build-Devcontainer:
     runs-on: ubuntu-20.04
     strategy:
       fail-fast: false
       matrix:
-        python_version: ["3.9"]
+        python: ["3.9", "3.10"]
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
       - name: Login to DockerHub
-        if: env.PUSH_RELEASE == 'true' && github.repository_owner == 'Lightning-AI'
+        if: env.PUSH_DOCKERHUB == 'true' && github.repository_owner == 'Lightning-AI'
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
@@ -44,8 +44,45 @@ jobs:
         uses: docker/build-push-action@v5
         with:
           build-args: |
-            VARIANT=${{ matrix.python_version }}
+            VARIANT=${{ matrix.python }}
           file: .devcontainer/Dockerfile
-          push: ${{ env.PUSH_RELEASE }}
-          tags: pytorchlightning/metrics-dev
+          push: ${{ env.PUSH_DOCKERHUB }}
+          tags: "pytorchlightning/torchmetrics:devcontainer-py${{ matrix.python }}"
         timeout-minutes: 50
+
+  build-cuda:
+    if: github.event.pull_request.draft == false
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # These are the base images for PL release docker images,
+          # so include at least all of the combinations in release-dockers.yml.
+          - { python: "3.8", pytorch: "1.8.1", cuda: "11.1.1", ubuntu: "20.04" }
+          - { python: "3.9", pytorch: "1.10", cuda: "11.8.0", ubuntu: "22.04" }
+          - { python: "3.9", pytorch: "1.11", cuda: "11.8.0", ubuntu: "22.04" }
+          - { python: "3.9", pytorch: "1.13", cuda: "11.8.0", ubuntu: "22.04" }
+          - { python: "3.10", pytorch: "2.0", cuda: "11.8.0", ubuntu: "22.04" }
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        if: env.PUSH_DOCKERHUB == 'true' && github.repository_owner == 'Lightning-AI'
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Build (and Push) Devcontainer
+        uses: docker/build-push-action@v5
+        with:
+          build-args: |
+            UBUNTU_VERSION=${{ matrix.ubuntu }}
+            PYTHON_VERSION=${{ matrix.python }}
+            PYTORCH_VERSION=${{ matrix.pytorch }}
+            CUDA_VERSION=${{ matrix.cuda }}
+          file: dockers/ubuntu-cuda/Dockerfile
+          push: ${{ env.PUSH_DOCKERHUB }}
+          tags: "pytorchlightning/torchmetrics:ubuntu${{ matrix.ubuntu }}-cuda${{ matrix.cuda }}-py${{ matrix.python }}-torch${{ matrix.pytorch }}"
+        timeout-minutes: 55
diff --git a/dockers/README.md b/dockers/README.md
@@ -0,0 +1,54 @@
+# Docker images
+
+## Build images from Dockerfiles
+
+You can build the images on your own; note that it takes a lot of time, so be prepared.
+
+```bash
+git clone https://github.com/Lightning-AI/torchmetrics.git
+
+# build with the default arguments
+docker image build -t torchmetrics:latest -f dockers/ubuntu-cuda/Dockerfile .
+
+# build with specific arguments
+docker image build -t torchmetrics:ubuntu-cuda11.7.1-py3.9-torch1.13 \
+  -f dockers/ubuntu-cuda/Dockerfile \
+  --build-arg PYTHON_VERSION=3.9 \
+  --build-arg PYTORCH_VERSION=1.13 \
+  --build-arg CUDA_VERSION=11.7.1 \
+  .
+```
+
+To run your docker image, use:
+
+```bash
+docker image list
+docker run --rm -it torchmetrics:latest bash
+```
+
+and if you do not need it anymore, just clean it:
+
+```bash
+docker image list
+docker image rm torchmetrics:latest
+```
+
+## Run docker image with GPUs
+
+To run a docker image with access to your GPUs, you need to install the NVIDIA Container Toolkit:
+
+```bash
+# Add the package repositories
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
+sudo systemctl restart docker
+```
+
+and later run the docker image with `--gpus all`. For example,
+
+```bash
+docker run --rm -it --gpus all torchmetrics:ubuntu-cuda11.7.1-py3.9-torch1.13
+```