Skip to content

Commit

Permalink
build more docker configs (Lightning-AI#3533)
Browse files Browse the repository at this point in the history
* update build cases

* list

* matrix

* matrix

* builds

* docker

* -j1

* -q

* -q

* sep

* docker

* docker

* mergify

* -j1

* -j1

* horovod

* copy
  • Loading branch information
Borda authored Sep 22, 2020
1 parent c591013 commit 37a59be
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 54 deletions.
19 changes: 9 additions & 10 deletions .github/workflows/ci_dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ jobs:
strategy:
fail-fast: false
matrix:
python_version: [3.7]
pytorch_version: [1.6]
python_version: [3.7, 3.8]
pytorch_version: [1.5]
steps:
- name: Checkout
uses: actions/checkout@v2
Expand All @@ -27,15 +27,15 @@ jobs:
dockerfile: dockers/conda/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }}
push: false
timeout-minutes: 40
timeout-minutes: 50

build-XLA:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
python_version: [3.7]
xla_version: ["nightly"]
xla_version: [1.6, "nightly"]
steps:
- name: Checkout
uses: actions/checkout@v2
Expand All @@ -47,21 +47,20 @@ jobs:
dockerfile: dockers/base-xla/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},XLA_VERSION=${{ matrix.xla_version }}
push: false
timeout-minutes: 40
timeout-minutes: 50

build-cuda:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
python_version: [3.7]
pytorch_version: [1.6]
pytorch_channel: [pytorch]
# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
include:
- python_version: 3.7
pytorch_version: 1.7
pytorch_channel: pytorch-nightly
- python_version: 3.8
pytorch_version: 1.5
pytorch_channel: pytorch
steps:
- name: Checkout
uses: actions/checkout@v2
Expand All @@ -73,4 +72,4 @@ jobs:
dockerfile: dockers/base-cuda/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
push: false
timeout-minutes: 40
timeout-minutes: 50
28 changes: 18 additions & 10 deletions .github/workflows/docker-builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v2

# TODO: move this to nightly events
- name: Publish Master to Docker
# publish master
uses: docker/build-push-action@v1.1.0
Expand All @@ -34,7 +35,7 @@ jobs:
dockerfile: dockers/conda/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }}
tags: "nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
timeout-minutes: 40
timeout-minutes: 55

- name: Get release version
if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release'
Expand All @@ -52,7 +53,7 @@ jobs:
dockerfile: dockers/conda/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }}
tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
timeout-minutes: 40
timeout-minutes: 55

build-XLA:
runs-on: ubuntu-20.04
Expand All @@ -75,20 +76,27 @@ jobs:
dockerfile: dockers/base-xla/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},XLA_VERSION=${{ matrix.xla_version }}
tags: "base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}"
timeout-minutes: 35
timeout-minutes: 55

build-cuda:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
python_version: [3.7]
pytorch_version: [1.3, 1.4, 1.5, 1.6]
pytorch_channel: [pytorch]
python_version: [3.6, 3.7, 3.8]
pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7]
pytorch_channel: ["pytorch", "pytorch-nightly"]
# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
include:
- python_version: 3.7
pytorch_version: 1.7
exclude:
- pytorch_version: 1.7
pytorch_channel: pytorch
- pytorch_version: 1.3
pytorch_channel: pytorch-nightly
- pytorch_version: 1.4
pytorch_channel: pytorch-nightly
- pytorch_version: 1.5
pytorch_channel: pytorch-nightly
- pytorch_version: 1.6
pytorch_channel: pytorch-nightly
steps:
- name: Checkout
Expand All @@ -104,4 +112,4 @@ jobs:
dockerfile: dockers/base-cuda/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
tags: "base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
timeout-minutes: 40
timeout-minutes: 55
2 changes: 1 addition & 1 deletion .mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pull_request_rules:
# no requested changes from any reviewer
- "#changes-requested-reviews-by=0"
# this serves as an "ALL checks have to pass" rule, as we actually have around 40 tests in total
- "#status-success>=47"
- "#status-success>=50"
# this is just in case since we rely on GPU tests (note: redundant to the above)
- status-success=continuous-integration/drone/pr
- "status-success=ci/circleci: TPU-tests"
Expand Down
50 changes: 28 additions & 22 deletions dockers/base-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ ARG CUDNN_VERSION=7
ARG CUDA_VERSION=10.1

FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel
# FROM nvidia/cuda:${CUDA_VERSION}-devel

ARG PYTHON_VERSION=3.7
ARG PYTORCH_VERSION=1.6
Expand All @@ -17,22 +18,14 @@ ARG CONDA_VERSION=4.7.12

SHELL ["/bin/bash", "-c"]

ENV HOROVOD_GPU_OPERATIONS=NCCL
ENV HOROVOD_WITH_PYTORCH=1
ENV HOROVOD_WITHOUT_TENSORFLOW=1
ENV HOROVOD_WITHOUT_MXNET=1
ENV HOROVOD_WITH_GLOO=1
ENV HOROVOD_WITHOUT_MPI=1
ENV PATH="$PATH:/root/.local/bin"
# TODO: uncomment in horovod next release, https://github.com/horovod/horovod/pull/2239
# ENV MAKEFLAGS="-j$(nproc)"

RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
ca-certificates \
build-essential \
cmake \
git \
curl \
ca-certificates \
&& \
# Cleaning
apt-get autoremove -y && \
Expand Down Expand Up @@ -61,31 +54,44 @@ ENV PATH="${WORKDIR}/miniconda/bin:$PATH"
ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/lib:$LD_LIBRARY_PATH"
ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda"

ENV HOROVOD_GPU_OPERATIONS=NCCL
ENV HOROVOD_WITH_PYTORCH=1
ENV HOROVOD_WITHOUT_TENSORFLOW=1
ENV HOROVOD_WITHOUT_MXNET=1
ENV HOROVOD_WITH_GLOO=1
ENV HOROVOD_WITHOUT_MPI=1
# TODO: uncomment in horovod next release, https://github.com/horovod/horovod/pull/2239
# ENV MAKEFLAGS="-j$(nproc)"

# conda init
RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION pytorch=$PYTORCH_VERSION torchvision cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
conda init bash && \
# NOTE: this requires that the channel is listed in the yaml before the packages
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \
python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([l for l in req if 'horovod' not in l])" && \
conda env update --file environment.yml && \
conda clean -ya && \
rm environment.yml && \
# Disable cache
conda install "pip>20.1" -y && \
pip config set global.cache-dir false
rm environment.yml

ENV PATH ${WORKDIR}/miniconda/envs/${CONDA_ENV}/bin:$PATH
ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH"
# if you want this environment to be the default one, uncomment the following line:
ENV CONDA_DEFAULT_ENV=${CONDA_ENV}

COPY ./requirements/test.txt requirements-tests.txt
COPY ./requirements/examples.txt requirements-examples.txt
COPY --chown=flash ./requirements/extra.txt requirements-extra.txt
COPY --chown=flash ./requirements/test.txt requirements-tests.txt
COPY --chown=flash ./requirements/examples.txt requirements-examples.txt

RUN \
echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \
source ~/.bashrc && \
# Disable cache
pip config set global.cache-dir false && \
#echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
#echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \
#source ~/.bashrc && \
# filter only Horovod
python -c "fname = 'requirements-extra.txt' ; req = open(fname).readlines() ; open(fname, 'w').writelines([l for l in req if 'horovod' in l])" && \
# Install all requirements
pip install --global-option="--quiet" -r requirements-extra.txt && \
pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \
pip install -r requirements-examples.txt --upgrade-strategy only-if-needed && \
rm requirements* && \
Expand Down
19 changes: 10 additions & 9 deletions dockers/conda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-devel

# install versions
ARG PYTHON_VERSION=3.7
ARG PYTORCH_VERSION=1.4
ARG PYTORCH_VERSION=1.6
ARG PYTORCH_CHANNEL=pytorch
ARG LIGHTNING_VERSION=""
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
Expand Down Expand Up @@ -45,16 +45,13 @@ ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda"
COPY --chown=flash environment.yml environment.yml

# conda init
RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION && \
RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION pytorch=$PYTORCH_VERSION torchvision cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
conda init bash && \
# conda install -y python=$PYTHON_VERSION && \
conda install pytorch=$PYTORCH_VERSION cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
# NOTE: this requires that the channel is listed in the yaml before the packages
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \
conda env update --file environment.yml && \
rm environment.yml && \

# Disable cache
conda install "pip>20.1" && \
pip config set global.cache-dir false
conda clean -ya && \
rm environment.yml

ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH"
# if you want this environment to be the default one, uncomment the following line:
Expand All @@ -65,6 +62,9 @@ COPY --chown=flash ./ ./pytorch-lightning/

# install dependencies
RUN \
# Disable cache
#conda install "pip>20.1" && \
#pip config set global.cache-dir false && \
if [ -z $LIGHTNING_VERSION ] ; then \
pip install ./pytorch-lightning --upgrade-strategy only-if-needed ; \
rm -rf pytorch-lightning ; \
Expand All @@ -75,6 +75,7 @@ RUN \

RUN python --version && \
pip --version && \
pip list && \
python -c "import pytorch_lightning as pl; print(pl.__version__)"

CMD ["/bin/bash"]
4 changes: 2 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ channels:
- pytorch

dependencies:
#- python=3.7.6
- pip>=20.0.2
- python>=3.6
- pip
- numpy>=1.16.4
- pytorch>=1.3
- future>=0.17.1
Expand Down

0 comments on commit 37a59be

Please sign in to comment.