Skip to content

Commit

Permalink
ci: switch to custom docker images (#2123)
Browse files Browse the repository at this point in the history
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
Borda and pre-commit-ci[bot] authored Oct 2, 2023
1 parent 39ba91a commit 886c09b
Show file tree
Hide file tree
Showing 16 changed files with 243 additions and 88 deletions.
6 changes: 3 additions & 3 deletions .azure/gpu-integrations.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,22 +65,22 @@ jobs:
set -e
pip install -q packaging fire requests wget
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
python adjust-torch-versions.py requirements.txt $(torch-ver)
python adjust-torch-versions.py requirements/base.txt $(torch-ver)
python adjust-torch-versions.py requirements/integrate.txt $(torch-ver)
# FixMe: this shall not be for all integrations/cases
python .github/assistant.py set-oldest-versions --req_files='["requirements/integrate.txt"]'
cat requirements/integrate.txt
displayName: "Adjust versions"
- bash: |
set -ex
pip install -q -r requirements/integrate.txt
# force reinstall TM as it could be overwritten by integration's dependencies
pip install . -U -r requirements/test.txt --find-links ${TORCH_URL}
pip list
displayName: "Install package & integrations"
- bash: |
set -e
pip list
python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$(torch-ver)', f'PyTorch: {ver}'"
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'found GPUs: {mgpu}'"
displayName: "Sanity check"
Expand Down
56 changes: 11 additions & 45 deletions .azure/gpu-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@ jobs:
matrix:
"PyTorch | old":
# Torch does not have build wheels with old Torch versions for newer CUDA
docker-image: "nvidia/cuda:11.1.1-cudnn8-devel-ubuntu20.04"
docker-image: "pytorchlightning/torchmetrics:ubuntu20.04-cuda11.1.1-py3.8-torch1.8.1"
agent-pool: "lit-rtx-3090"
torch-ver: "1.8.1"
"PyTorch | 1.X":
docker-image: "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
docker-image: "pytorchlightning/torchmetrics:ubuntu22.04-cuda11.8.0-py3.9-torch1.13"
agent-pool: "lit-rtx-3090"
torch-ver: "1.13.1"
"PyTorch | 2.X":
docker-image: "pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime"
docker-image: "pytorchlightning/torchmetrics:ubuntu22.04-cuda11.8.0-py3.10-torch2.0"
agent-pool: "lit-rtx-3090"
torch-ver: "2.0.0"
torch-ver: "2.0.1"
# how long to run the job before automatically cancelling
timeoutInMinutes: "120"
# how much time to give 'run always even if cancelled tasks' before stopping them
Expand All @@ -51,37 +51,12 @@ jobs:

container:
image: "$(docker-image)"
options: "--gpus=all --shm-size=8g -v /usr/bin/docker:/tmp/docker:ro -v /var/tmp:/var/tmp"
options: "--gpus=all --shm-size=8g -v /var/tmp:/var/tmp"

workspace:
clean: all

steps:
- script: |
set -ex
container_id=$(head -1 /proc/self/cgroup|cut -d/ -f3)
echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
/tmp/docker exec -t -u 0 $container_id \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo"
echo "##vso[task.setvariable variable=CONTAINER_ID]$container_id"
displayName: "Install Sudo in container (thanks Microsoft!)"
- script: |
sudo apt-get update -q --fix-missing
sudo apt-get install -q -y --no-install-recommends \
build-essential \
wget \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-distutils
sudo update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate
python get-pip.py
env:
PYTHON_VERSION: "3.8"
condition: startsWith(variables['docker-image'], 'nvidia/cuda:')
displayName: "install python & pip"
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
CUDA_version=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p')
Expand All @@ -96,7 +71,6 @@ jobs:
whereis nvidia
nvidia-smi
echo $CUDA_VISIBLE_DEVICES
echo $CONTAINER_ID
echo $TORCH_URL
python --version
pip --version
Expand All @@ -105,29 +79,22 @@ jobs:
displayName: "Image info & NVIDIA"
- bash: |
pip install -q packaging wget
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
pip install -q packaging
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
python adjust-torch-versions.py requirements.txt $(torch-ver)
for fpath in `ls requirements/*.txt`; do
python adjust-torch-versions.py $fpath $(torch-ver)
done
# FixMe: missing setting minimal configurations for testing
displayName: "Adjust versions"
- bash: |
set -ex
sudo apt-get update -qq --fix-missing
sudo apt-get install -y --no-install-recommends \
build-essential gcc g++ cmake ffmpeg git libsndfile1 unzip
# pip install pip -U
pip install -q "numpy<1.24" # trying to resolve pesq installation
pip install . -U -r ./requirements/devel.txt \
--prefer-binary --find-links=${TORCH_URL}
pip install mkl-service==2.4.0 # needed for the gpu multiprocessing
pip list
pip install . -U -r ./requirements/devel.txt --prefer-binary --find-links=${TORCH_URL}
displayName: "Install environment"
- bash: |
set -e
pip list
python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$(torch-ver)', f'PyTorch: {ver}'"
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'found GPUs: {mgpu}'"
displayName: "Sanity check"
Expand All @@ -149,8 +116,7 @@ jobs:
displayName: "DocTesting"

- bash: |
# wget is simpler but does not work on Windows
python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/metrics/data.zip', 'data.zip')"
wget https://pl-public-data.s3.amazonaws.com/metrics/data.zip
unzip -o data.zip
ls -l _data/*
workingDirectory: tests
Expand Down
4 changes: 1 addition & 3 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ RUN if [ "${NODE_VERSION}" != "none" ]; then \
fi

COPY requirements/ /tmp/pip-tmp/requirements/
COPY requirements.txt /tmp/pip-tmp/
RUN \
pip3 install awscli && \
aws s3 sync --no-sign-request s3://sphinx-packages/ dist/ && \
Expand All @@ -23,8 +22,7 @@ RUN \
rm -rf /tmp/pip-tmp

# [Optional] If your pip requirements rarely change, uncomment this section to add them to the image.
# COPY requirements.txt /tmp/pip-tmp/
# RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
# RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements/base.txt \
# && rm -rf /tmp/pip-tmp

# [Optional] Uncomment this section to install additional OS packages.
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// https://github.com/microsoft/vscode-dev-containers/tree/v0.194.0/containers/python-3
{
"name": "PyTorch Lightning Metrics",
"image": "pytorchlightning/metrics-dev",
"image": "pytorchlightning/torchmetrics:devcontainer-py3.9",
// If you want to use a different Python version, uncomment the build object below
// "build": {
// "dockerfile": "Dockerfile",
Expand Down
2 changes: 1 addition & 1 deletion .github/actions/pull-caches/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ runs:
if: inputs.pytorch-version != ''
run: |
curl https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py -o adjust-torch-versions.py
python adjust-torch-versions.py requirements.txt ${{ inputs.pytorch-version }}
python adjust-torch-versions.py requirements/base.txt ${{ inputs.pytorch-version }}
shell: bash

- name: Set min. dependencies
Expand Down
4 changes: 2 additions & 2 deletions .github/assistant.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ def prune_packages(req_file: str, *pkgs: str) -> None:
fp.writelines(lines)

@staticmethod
def set_min_torch_by_python(fpath: str = "requirements.txt") -> None:
def set_min_torch_by_python(fpath: str = "requirements/base.txt") -> None:
"""Set minimal torch version according to Python actual version.
>>> AssistantCLI.set_min_torch_by_python("../requirements.txt")
>>> AssistantCLI.set_min_torch_by_python("../requirements/base.txt")
"""
py_ver = f"{sys.version_info.major}.{sys.version_info.minor}"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ci-integrate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ jobs:
curl https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py -o adjust-torch-versions.py
pip install -r requirements/test.txt -r requirements/integrate.txt \
--find-links $PYTORCH_URL -f $PYPI_CACHE --upgrade-strategy eager
python adjust-torch-versions.py requirements.txt
python adjust-torch-versions.py requirements/base.txt
python adjust-torch-versions.py requirements/image.txt
cat requirements.txt
cat requirements/base.txt
pip install -e . --find-links $PYTORCH_URL -f $PYPI_CACHE
pip list
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ jobs:
run: |
curl https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py -o adjust-torch-versions.py
pip install -q cython # needed for installing `pycocotools` in latest config
python adjust-torch-versions.py requirements.txt
for fpath in `ls requirements/*.txt`; do
python adjust-torch-versions.py $fpath
done
Expand Down
57 changes: 47 additions & 10 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "Build & Push Docker"
name: "Build (& Push) Dockers"

on: # Trigger the workflow on push or pull request, but only for the master branch
push:
Expand All @@ -7,10 +7,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
branches: [master]
paths:
- "requirements/*"
- ".devcontainer/*"
- "environment.yml"
- "requirements.txt"
- ".github/workflows/*docker*.yml"
- ".devcontainer/*"
- "dockers/**"
- ".github/workflows/docker-build.yml"
- "setup.py"
workflow_dispatch: {}

Expand All @@ -19,21 +19,21 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/master' }}

env:
PUSH_RELEASE: ${{ github.ref == 'refs/heads/master' || github.event_name == 'workflow_dispatch' }}
PUSH_DOCKERHUB: ${{ github.ref == 'refs/heads/master' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }}

jobs:
build-Devcontainer:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
python_version: ["3.9"]
python: ["3.9", "3.10"]
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Login to DockerHub
if: env.PUSH_RELEASE == 'true' && github.repository_owner == 'Lightning-AI'
if: env.PUSH_DOCKERHUB == 'true' && github.repository_owner == 'Lightning-AI'
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
Expand All @@ -44,8 +44,45 @@ jobs:
uses: docker/build-push-action@v5
with:
build-args: |
VARIANT=${{ matrix.python_version }}
VARIANT=${{ matrix.python }}
file: .devcontainer/Dockerfile
push: ${{ env.PUSH_RELEASE }}
tags: pytorchlightning/metrics-dev
push: ${{ env.PUSH_DOCKERHUB }}
tags: "pytorchlightning/torchmetrics:devcontainer-py${{ matrix.python }}"
timeout-minutes: 50

build-cuda:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
# These are the base images for PL release docker images,
# so include at least all of the combinations in release-dockers.yml.
- { python: "3.8", pytorch: "1.8.1", cuda: "11.1.1", ubuntu: "20.04" }
- { python: "3.9", pytorch: "1.10", cuda: "11.8.0", ubuntu: "22.04" }
- { python: "3.9", pytorch: "1.11", cuda: "11.8.0", ubuntu: "22.04" }
- { python: "3.9", pytorch: "1.13", cuda: "11.8.0", ubuntu: "22.04" }
- { python: "3.10", pytorch: "2.0", cuda: "11.8.0", ubuntu: "22.04" }
steps:
- uses: actions/checkout@v4

- name: Login to DockerHub
uses: docker/login-action@v3
if: env.PUSH_DOCKERHUB == 'true' && github.repository_owner == 'Lightning-AI'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Build (and Push) Devcontainer
uses: docker/build-push-action@v5
with:
build-args: |
UBUNTU_VERSION=${{ matrix.ubuntu }}
PYTHON_VERSION=${{ matrix.python }}
PYTORCH_VERSION=${{ matrix.pytorch }}
CUDA_VERSION=${{ matrix.cuda }}
file: dockers/ubuntu-cuda/Dockerfile
push: ${{ env.PUSH_DOCKERHUB }}
tags: "pytorchlightning/torchmetrics:ubuntu${{ matrix.ubuntu }}-cuda${{ matrix.cuda }}-py${{ matrix.python }}-torch${{ matrix.pytorch }}"
timeout-minutes: 55
54 changes: 54 additions & 0 deletions dockers/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Docker images

## Build images from Dockerfiles

You can build the images yourself. Note that a build takes a long time, so be prepared.

```bash
git clone https://github.com/Lightning-AI/torchmetrics.git

# build with the default arguments
docker image build -t torchmetrics:latest -f dockers/ubuntu-cuda/Dockerfile .

# build with specific arguments
docker image build -t torchmetrics:ubuntu-cuda11.7.1-py3.9-torch1.13 \
-f dockers/ubuntu-cuda/Dockerfile \
--build-arg PYTHON_VERSION=3.9 \
--build-arg PYTORCH_VERSION=1.13 \
--build-arg CUDA_VERSION=11.7.1 \
.
```

To run your docker image, use:

```bash
docker image list
docker run --rm -it torchmetrics:latest bash
```

If you no longer need the image, remove it:

```bash
docker image list
docker image rm torchmetrics:latest
```

## Run docker image with GPUs

To run a docker image with access to your GPUs, you first need to install the NVIDIA Container Toolkit:

```bash
# Add the package repositories
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list

sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```

Afterwards, run the docker image with `--gpus all`. For example:

```bash
docker run --rm -it --gpus all torchmetrics:ubuntu-cuda11.7.1-py3.9-torch1.13
```
Loading

0 comments on commit 886c09b

Please sign in to comment.