# Dockerfile.test.cpu

# Ubuntu release used as the base image tag; declared before FROM so it can
# parameterize the FROM line.
ARG UBUNTU_VERSION=20.04
FROM ubuntu:${UBUNTU_VERSION}

# Arguments for the build. UBUNTU_VERSION needs to be repeated because
# the first usage only applies to the FROM tag.
ARG UBUNTU_VERSION=20.04
# MPI flavour to install; the steps below handle OpenMPI, ONECCL, MPICH and None.
ARG MPI_KIND=OpenMPI
# Python version; installed via apt as python${PYTHON_VERSION}.
ARG PYTHON_VERSION=3.6
# g++ version; installed via apt as g++-${GPP_VERSION}.
ARG GPP_VERSION=7
# Framework packages passed to pip. The special values "tf-nightly",
# "torch-nightly" and "mxnet-nightly" move the respective install to the
# nightly section after the COPY step; KERAS_PACKAGE=None skips Keras.
# NOTE: keep versions in sync with setup.py extras_require{'dev'}:
ARG TENSORFLOW_PACKAGE=tensorflow-cpu==1.15.0
ARG KERAS_PACKAGE=keras==2.2.4
ARG PYTORCH_PACKAGE=torch==1.2.0+cpu
ARG PYTORCH_LIGHTNING_PACKAGE=pytorch_lightning==0.7.6
ARG TORCHVISION_PACKAGE=torchvision==0.4.0+cpu
ARG MXNET_PACKAGE=mxnet==1.5.0
ARG PYSPARK_PACKAGE=pyspark==2.4.7
# if SPARK_PACKAGE is set, installs Spark into /spark from the tgz archive
# if SPARK_PACKAGE is a preview version, installs PySpark from the tgz archive
# see https://archive.apache.org/dist/spark/ for available packages, version must match PYSPARK_PACKAGE
ARG SPARK_PACKAGE=spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
# oneCCL git ref whose GitHub archive is downloaded when MPI_KIND=ONECCL.
ARG CCL_PACKAGE=master
# Extra text prepended to the pip command that builds Horovod (e.g. VAR=value pairs).
ARG HOROVOD_BUILD_FLAGS=""

# to avoid interaction with apt-get
# (set via ENV, so it also stays in the environment of containers started from this image)
ENV DEBIAN_FRONTEND=noninteractive

# Set default shell to /bin/bash
# -e: a RUN step aborts on the first failing command
# -u: expanding an unset variable is an error
# -o pipefail: a pipeline fails if any of its commands fails
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

# Log given ARGs (and all other environment vars)
# The sorted dump ends up in the build log, which makes the effective build arguments visible.
RUN env | sort

# add-apt-repository (from software-properties-common) is needed to register
# the toolchain PPA that provides specific g++ versions.
RUN apt-get update -qq \
 && apt-get install -y --no-install-recommends software-properties-common \
 && rm -rf /var/lib/apt/lists/*

RUN add-apt-repository ppa:ubuntu-toolchain-r/test

# Base tooling: compilers, build system, ssh, git and download helpers.
# Package list kept in alphabetical order.
RUN apt-get update -qq \
 && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        g++-${GPP_VERSION} \
        git \
        moreutils \
        openssh-client \
        openssh-server \
        wget \
 && rm -rf /var/lib/apt/lists/*

# SSH setup: generate a passphrase-less root key pair and authorize it for
# logins as root.
RUN ssh-keygen -q -N '' -f /root/.ssh/id_rsa && \
    cp -v /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys

# Install Python.
RUN apt-get update -qq && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-distutils && \
    rm -rf /var/lib/apt/lists/*
# Make the selected interpreter the default `python` and `python<major>`
# (${PYTHON_VERSION/%.*/} strips the ".<minor>" suffix).
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python${PYTHON_VERSION/%.*/}
# Bootstrap pip. The unversioned get-pip.py only supports currently maintained
# Python versions, so older interpreters (including the default
# PYTHON_VERSION=3.6) must use the version-specific script published under
# https://bootstrap.pypa.io/pip/<version>/.
RUN case "${PYTHON_VERSION}" in \
        3.6|3.7|3.8) get_pip_url="https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py" ;; \
        *) get_pip_url="https://bootstrap.pypa.io/get-pip.py" ;; \
    esac && \
    wget --progress=dot:mega "${get_pip_url}" && python get-pip.py && rm get-pip.py

# Python packages used by the test suite; always upgraded and reinstalled.
RUN pip install --no-cache-dir --upgrade --force-reinstall \
        mock \
        parameterized \
        pytest \
        pytest-forked \
        requests

# Launch helper scripts (one line each, written at build time):
#   /spark_env.sh          runs its arguments with the Spark/PySpark environment set
#   /pytest.sh             runs pytest, writing a per-rank junit report to /artifacts
#   /pytest_standalone.sh  same, but with --forked and a "standalone" report name
# The script bodies are single-quoted so that $1, $2, $@ and the rank variables
# are expanded when the scripts run, not while the image is built.
RUN printf 'env SPARK_HOME=/spark SPARK_DRIVER_MEM=512m PYSPARK_PYTHON=%s PYSPARK_DRIVER_PYTHON=%s "$@"\n' "/usr/bin/python${PYTHON_VERSION}" "/usr/bin/python${PYTHON_VERSION}" > /spark_env.sh && \
    printf '%s\n' '/spark_env.sh pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.$1.${HOROVOD_RANK:-${OMPI_COMM_WORLD_RANK:-${PMI_RANK}}}.$2.xml ${@:2}' > /pytest.sh && \
    printf '%s\n' '/spark_env.sh pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.$1.standalone.$2.xml --forked ${@:2}' > /pytest_standalone.sh && \
    chmod a+x /spark_env.sh /pytest.sh /pytest_standalone.sh

# Install Spark stand-alone cluster.
# The archive is streamed straight into tar; the unpacked directory (archive
# name without the .tgz suffix) is then moved to /spark.
RUN if [[ -n ${SPARK_PACKAGE} ]]; then \
        wget --progress=dot:giga -O - "https://www.apache.org/dyn/closer.lua/spark/${SPARK_PACKAGE}?action=download" | tar -xz -C /tmp; \
        spark_dir=$(basename "${SPARK_PACKAGE}" .tgz); \
        mv -v "/tmp/${spark_dir}" /spark; \
    fi

# Install PySpark.
# A Java 8 JDK is installed first. apt-get is used instead of apt because apt's
# command line interface is not meant for scripts, and the package lists are
# removed in the same layer.
RUN apt-get update -qq && apt-get install -y openjdk-8-jdk-headless && \
    rm -rf /var/lib/apt/lists/*
# Regular releases come from PyPI; "-preview" releases are built from the
# python/ directory of the Spark archive unpacked to /spark (which needs
# pandoc/pypandoc). `apt-get install` needs -y here: without it apt-get aborts
# at the confirmation prompt because the build is not interactive.
RUN if [[ ${SPARK_PACKAGE} != *"-preview"* ]]; then \
        pip install --no-cache-dir ${PYSPARK_PACKAGE}; \
    else \
        apt-get update -qq && apt-get install -y pandoc; \
        rm -rf /var/lib/apt/lists/*; \
        pip install --no-cache-dir pypandoc; \
        (cd /spark/python && python setup.py sdist && pip install --no-cache-dir dist/pyspark-*.tar.gz && rm dist/pyspark-*); \
    fi

# Pin cloudpickle to 1.3.0 when a Spark 2 PySpark package is selected:
# Dill breaks cloudpickle > 1.3.0 when using Spark2
# https://github.com/cloudpipe/cloudpickle/issues/393
RUN case "${PYSPARK_PACKAGE}" in \
        "pyspark==2."*) pip install --no-cache-dir cloudpickle==1.3.0 ;; \
    esac

# Install Ray.
# No version is given, so pip picks the newest release available for the
# installed Python at build time.
RUN pip install --no-cache-dir ray

# Install MPI.
# Depending on MPI_KIND this step
#   OpenMPI: builds Open MPI 4.1.4 from source into /usr/local,
#   ONECCL:  builds oneCCL (ref CCL_PACKAGE) into /usr/local/oneccl and writes
#            the /mpirun_command_ofi and /mpirun_command_mpi wrapper scripts,
#   MPICH:   installs the distribution's mpich package,
# and writes the matching launcher command line to /mpirun_command.
# Any other value (e.g. None) installs nothing.
# Builds use one make job per CPU (a bare `make -j` places no limit on the
# number of parallel jobs), and sources, tarballs and apt lists are removed in
# this same layer so they do not end up in the image.
# The two sed calls on setvars.sh add ":-" defaults to ${I_MPI_ROOT} and $1 so
# the script can be sourced from a shell running with `set -u`.
RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
        wget --progress=dot:mega -O /tmp/openmpi-4.1.4.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.4.tar.gz && \
            cd /tmp && tar -zxf /tmp/openmpi-4.1.4.tar.gz && \
            mkdir openmpi-4.1.4/build && cd openmpi-4.1.4/build && ../configure --prefix=/usr/local && \
            make -j"$(nproc)" all && make install && ldconfig && \
            cd / && rm -rf /tmp/openmpi-4.1.4 /tmp/openmpi-4.1.4.tar.gz && \
            echo "mpirun -allow-run-as-root -np 2 -H localhost:2 -bind-to none -map-by slot -mca mpi_abort_print_stack 1 -tag-output" > /mpirun_command; \
    elif [[ ${MPI_KIND} == "ONECCL" ]]; then \
        wget --progress=dot:mega -O /tmp/oneccl.tar.gz https://github.com/oneapi-src/oneCCL/archive/${CCL_PACKAGE}.tar.gz && \
            cd /tmp && tar -zxf oneccl.tar.gz && \
            mkdir oneCCL-${CCL_PACKAGE}/build && cd oneCCL-${CCL_PACKAGE}/build && cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local/oneccl -DCMAKE_BUILD_TYPE=Release && make -j"$(nproc)" install && \
            cd / && rm /tmp/oneccl.tar.gz && rm -Rf /tmp/oneCCL-${CCL_PACKAGE} && \
            sed -i 's/if \[ -z \"\${I_MPI_ROOT}\" \]/if [ -z \"${I_MPI_ROOT:-}\" ]/g' /usr/local/oneccl/env/setvars.sh && \
            sed -i 's/ \$1/ \${1:-}/g' /usr/local/oneccl/env/setvars.sh && \
            echo ". /usr/local/oneccl/env/setvars.sh" > /oneccl_env && \
            chmod +x /oneccl_env && \
            echo "export CCL_ATL_TRANSPORT=ofi; \
                  export HOROVOD_CCL_CACHE=1; \
                  echo \"\$(env)\"; \
                  echo \"mpirun is \$(which mpirun)\"; \
                  echo \"LD_LIBRARY_PATH is \$(echo \$LD_LIBRARY_PATH)\"; \
                  echo \"oneCCL links with \$(ldd /usr/local/oneccl/lib/libccl.so)\"; \
                  mpirun -np 2 -hosts localhost \$@" > /mpirun_command_ofi && \
            chmod +x /mpirun_command_ofi && \
            cp /mpirun_command_ofi /mpirun_command_mpi && \
            sed -i 's/export CCL_ATL_TRANSPORT=ofi;/export CCL_ATL_TRANSPORT=mpi;/g' /mpirun_command_mpi && \
            sed -i 's/export HOROVOD_CCL_CACHE=1;/export HOROVOD_CCL_CACHE=0;/g' /mpirun_command_mpi && \
            echo "/mpirun_command_mpi" > /mpirun_command && \
            echo "-L/usr/local/oneccl/lib -lmpi -I/usr/local/oneccl/include" > /mpicc_oneccl && \
            chmod +x /mpicc_oneccl; \
    elif [[ ${MPI_KIND} == "MPICH" ]]; then \
        apt-get update -qq && apt-get install -y mpich && \
            rm -rf /var/lib/apt/lists/* && \
            echo "mpirun -np 2 -l" > /mpirun_command; \
    fi

# Install mpi4py (skipped when MPI_KIND is None).
# SETUPTOOLS_USE_DISTUTILS=stdlib is required because installing mpi4py broke
# with setuptools>=60.1.0:
# https://github.com/mpi4py/mpi4py/issues/157#issuecomment-1001022274
# For oneCCL the build is additionally pointed at the MPI shipped in
# /usr/local/oneccl via I_MPI_ROOT and MPICC.
RUN case "${MPI_KIND}" in \
        None) ;; \
        ONECCL) \
            I_MPI_ROOT=/usr/local/oneccl MPICC=/usr/local/oneccl/bin/mpicc SETUPTOOLS_USE_DISTUTILS=stdlib \
                pip install --no-cache-dir mpi4py ;; \
        *) \
            SETUPTOOLS_USE_DISTUTILS=stdlib pip install --no-cache-dir mpi4py ;; \
    esac

# Install TensorFlow and Keras (releases); tf-nightly is handled after the COPY step.
# Pin scipy!=1.4.0: https://github.com/scipy/scipy/issues/11237
# Pin protobuf~=3.20 for tensorflow<2.6.5: https://github.com/tensorflow/tensorflow/issues/56077
# The MNIST dataset is downloaded once at the end so it is already on disk in the image.
RUN if [[ ${TENSORFLOW_PACKAGE} != "tf-nightly" ]]; then \
        case "${TENSORFLOW_PACKAGE}" in \
            tensorflow*==1.15.*|tensorflow-cpu==2.[012345].*) PROTOBUF_PACKAGE="protobuf~=3.20" ;; \
            *) PROTOBUF_PACKAGE="" ;; \
        esac; \
        pip install --no-cache-dir ${TENSORFLOW_PACKAGE} ${PROTOBUF_PACKAGE}; \
        if [[ ${KERAS_PACKAGE} != "None" ]]; then \
            pip uninstall -y keras; \
            pip install --no-cache-dir ${KERAS_PACKAGE} "scipy!=1.4.0" "pandas<1.1.0" "numpy<1.24.0"; \
        fi; \
        mkdir -p ~/.keras; \
        python -c "import tensorflow as tf; tf.keras.datasets.mnist.load_data()"; \
    fi

# Pin h5py < 3 for tensorflow: https://github.com/tensorflow/tensorflow/issues/44467
# numpy is re-pinned in the same command so the forced reinstall cannot pull in
# numpy >= 1.24. --no-cache-dir keeps pip's download cache out of the layer,
# like the other pip installs in this file.
RUN pip install --no-cache-dir --force-reinstall 'h5py<3.0' 'numpy<1.24.0'

# Install PyTorch (releases); torch-nightly is handled after the COPY step.
# Pin Pillow<7.0 for torchvision < 0.5.0: https://github.com/pytorch/vision/issues/1718
# Pin Pillow!=8.3.0 for torchvision: https://github.com/pytorch/vision/issues/4146
# installing pytorch lightning < 1.8.4 with pip > 23.2.1 fails with:
#   invalid metadata: .* suffix can only be used with == or != operators
# https://github.com/pypa/pipx/issues/998
# The lightning version (text after "==") is compared against 1.8.4 with
# `sort -V`: `sort -V -C` succeeds only if "1.8.4" already sorts before or equal
# to the requested version, so its failure means "older than 1.8.4". This is a
# true version comparison; `<` inside [[ ]] compares strings lexicographically
# and would e.g. rank "10" below "8".
RUN if [[ ${PYTORCH_PACKAGE} != "torch-nightly" ]]; then \
        pip install --no-cache-dir ${PYTORCH_PACKAGE} ${TORCHVISION_PACKAGE} -f https://download.pytorch.org/whl/torch_stable.html; \
        if [[ "${TORCHVISION_PACKAGE/%+*/}" == torchvision==0.[1234].* ]]; then \
            pip install --no-cache-dir "Pillow<7.0" --no-deps; \
        else \
            pip install --no-cache-dir "Pillow!=8.3.0" --no-deps; \
        fi; \
        pl_version=$(cut -d "=" -f 3 <<< "${PYTORCH_LIGHTNING_PACKAGE}"); \
        if ! printf '%s\n' "1.8.4" "${pl_version}" | sort -V -C; then \
            pip install --no-cache-dir -U pip==23.2.1; \
        fi; \
        pip install --no-cache-dir ${PYTORCH_LIGHTNING_PACKAGE}; \
    fi


# Install MXNet (releases); nothing is installed here when mxnet-nightly is requested.
RUN [[ ${MXNET_PACKAGE} == "mxnet-nightly" ]] || pip install --no-cache-dir ${MXNET_PACKAGE}

# Prefetch Spark MNIST dataset (stored compressed as /data/mnist.bz2).
RUN mkdir -p /work /data && \
    wget --progress=dot:mega -O /data/mnist.bz2 https://horovod-datasets.s3.amazonaws.com/mnist.bz2

# Prefetch Spark Rossmann dataset (unpacked into /data).
RUN mkdir -p /work /data && \
    wget --progress=dot:mega -O - https://horovod-datasets.s3.amazonaws.com/rossmann.tgz | tar -xz -C /data

# Prefetch PyTorch datasets (unpacked into /data).
RUN wget --progress=dot:mega -O - https://horovod-datasets.s3.amazonaws.com/pytorch_datasets.tgz | tar -xz -C /data

### END OF CACHE ###
# Copy the whole build context into /horovod. This layer changes whenever a
# file in the context changes, so every step below is rebuilt on each source
# change while the dependency layers above can be reused from the build cache.
COPY . /horovod

# Nightly packages are installed after the COPY step so they do not get cached.

# Install TensorFlow and Keras (nightly) - only when TENSORFLOW_PACKAGE is tf-nightly.
# Pin scipy!=1.4.0: https://github.com/scipy/scipy/issues/11237
RUN case "${TENSORFLOW_PACKAGE}" in \
        "tf-nightly") \
            pip install --no-cache-dir ${TENSORFLOW_PACKAGE}; \
            if [[ ${KERAS_PACKAGE} != "None" ]]; then \
                pip uninstall -y keras-nightly; \
                pip install --no-cache-dir ${KERAS_PACKAGE} "scipy!=1.4.0" "pandas<1.1.0" "numpy<1.24.0"; \
            fi; \
            mkdir -p ~/.keras; \
            python -c "import tensorflow as tf; tf.keras.datasets.mnist.load_data()" ;; \
    esac

# Install PyTorch (nightly) - only when PYTORCH_PACKAGE is torch-nightly.
# Pin Pillow!=8.3.0 for torchvision: https://github.com/pytorch/vision/issues/4146
# installing pytorch lightning < 1.8.4 with pip > 23.2.1 fails with:
#   invalid metadata: .* suffix can only be used with == or != operators
# https://github.com/pypa/pipx/issues/998
# The lightning version (text after "==") is compared against 1.8.4 with
# `sort -V`: `sort -V -C` succeeds only if "1.8.4" already sorts before or equal
# to the requested version, so its failure means "older than 1.8.4". This is a
# true version comparison; `<` inside [[ ]] compares strings lexicographically
# and would e.g. rank "10" below "8".
RUN if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then \
        pip install --no-cache-dir --pre torch ${TORCHVISION_PACKAGE} -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; \
        pip install --no-cache-dir "Pillow!=8.3.0" --no-deps; \
        pl_version=$(cut -d "=" -f 3 <<< "${PYTORCH_LIGHTNING_PACKAGE}"); \
        if ! printf '%s\n' "1.8.4" "${pl_version}" | sort -V -C; then \
            pip install --no-cache-dir -U pip==23.2.1; \
        fi; \
        pip install --no-cache-dir ${PYTORCH_LIGHTNING_PACKAGE}; \
    fi

# Install MXNet (nightly): a pre-release build from dist.mxnet.io, only when
# MXNET_PACKAGE is mxnet-nightly.
RUN [[ ${MXNET_PACKAGE} != "mxnet-nightly" ]] || \
    pip install --no-cache-dir --pre mxnet -f https://dist.mxnet.io/python/all

# Install Horovod.
# For oneCCL the oneCCL environment (setvars.sh) is sourced first and
# I_MPI_ROOT is exported; these exports stay in effect for the build commands
# that follow because they run in the same shell.
# LD_LIBRARY_PATH and PYTHONPATH are defined as empty strings beforehand if
# unset - presumably because setvars.sh reads them and the shell runs with
# `set -u`; confirm against setvars.sh.
# Afterwards an sdist is built from /horovod and installed with pip including
# the "spark" and "ray" extras, with HOROVOD_BUILD_FLAGS prepended and
# HOROVOD_WITH_TENSORFLOW/PYTORCH/MXNET set to 1.
RUN if [[ ${MPI_KIND} == "ONECCL" ]]; then \
      if [ -z "${LD_LIBRARY_PATH:-}" ]; then \
          export LD_LIBRARY_PATH=""; \
      fi; \
      if [ -z "${PYTHONPATH:-}" ]; then \
          export PYTHONPATH=""; \
      fi; \
      . /usr/local/oneccl/env/setvars.sh; \
      export I_MPI_ROOT=/usr/local/oneccl; \
      echo "horovod python setup.py sdist, mpicxx is $(which mpicxx)"; \
    fi; \
    cd /horovod && \
    python setup.py sdist && \
    bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]"

# Show the effective python package version to easily spot version differences
# (sorted list of all installed packages, printed to the build log)
RUN pip freeze | sort

# Assert installed package versions
# The script comes from the build context copied to /horovod; the build fails
# if it exits non-zero. Its checks are not visible in this file.
RUN /horovod/assert-package-versions.sh

# TensorFlow 1.1.0 only: make the MNIST example import keras from
# tensorflow.contrib instead of tensorflow.
RUN case "${TENSORFLOW_PACKAGE}" in \
        "tensorflow==1.1.0") \
            sed -i "s/from tensorflow import keras/from tensorflow.contrib import keras/" /horovod/examples/tensorflow/tensorflow_mnist.py ;; \
    esac

# Shrink the TensorFlow examples so they finish quickly:
#   - TensorFlow MNIST:               last_step 20000 -> 100
#   - TensorFlow Eager MNIST:         dataset.take 20000 -> 100
#   - TensorFlow 2.0 MNIST:           dataset.take 10000 -> 100
#   - TensorFlow 2.0 Data Service:    epochs 24 -> 2
RUN sed -i "s/last_step=20000/last_step=100/" /horovod/examples/tensorflow/tensorflow_mnist.py && \
    sed -i "s/dataset.take(20000/dataset.take(100/" /horovod/examples/tensorflow/tensorflow_mnist_eager.py && \
    sed -i "s/dataset.take(10000/dataset.take(100/" /horovod/examples/tensorflow2/tensorflow2_mnist.py && \
    sed -i "s/ epochs=24/ epochs=2/" /horovod/examples/tensorflow2/tensorflow2_mnist_data_service_train_fn_*_side_dispatcher.py

# Shrink the Keras MNIST advanced example: 2 epochs instead of 24, a single
# filter in the first Conv2D layer, and the second Conv2D layer removed.
RUN sed -i \
        -e "s/'--epochs', type=int, default=24,/'--epochs', type=int, default=2,/" \
        -e "s/model.add(Conv2D(32, kernel_size=(3, 3),/model.add(Conv2D(1, kernel_size=(3, 3),/" \
        -e "s/model.add(Conv2D(64, (3, 3), activation='relu'))//" \
        /horovod/examples/keras/keras_mnist_advanced.py

# Shrink the TensorFlow 2.0 Keras MNIST example: 2 epochs instead of 24, a
# single filter in the first Conv2D layer, and the second Conv2D layer removed.
# The last expression needs extended regex syntax (-E), so it stays a separate
# sed call.
RUN sed -i \
        -e "s/epochs=24/epochs=2/" \
        -e "s/tf.keras.layers.Conv2D(32, \\[3, 3\\],/tf.keras.layers.Conv2D(1, [3, 3],/" \
        /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py && \
    sed -i -E "s/\s+tf.keras.layers.Conv2D\(64, \\[3, 3\\], activation='relu'\),//" /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py

# Shrink the PyTorch MNIST example: 2 epochs instead of 10, and both
# convolution/pooling lines removed from the forward pass, with fc1 and the
# flattening view changed from 320 to 784 features accordingly.
RUN sed -i \
        -e "s/'--epochs', type=int, default=10,/'--epochs', type=int, default=2,/" \
        -e "s/self.fc1 = nn.Linear(320, 50)/self.fc1 = nn.Linear(784, 50)/" \
        -e "s/x = F.relu(F.max_pool2d(self.conv1(x), 2))//" \
        -e "s/x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))//" \
        -e "s/x = x.view(-1, 320)/x = x.view(-1, 784)/" \
        /horovod/examples/pytorch/pytorch_mnist.py

# Shrink the Keras Spark Rossmann examples (Run and Estimator variants): the
# Dense layers go from 1000 -> 100 and 500 -> 50 units. Both files get the same
# two substitutions; sed -i edits each listed file in place.
RUN sed -i \
        -e "s/x = Dense(1000,/x = Dense(100,/g" \
        -e "s/x = Dense(500,/x = Dense(50,/g" \
        /horovod/examples/spark/keras/keras_spark_rossmann_run.py \
        /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py