From ac71a1a461ca71a7bbe7a421c6b379f658d6bf78 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 9 Mar 2022 15:23:27 -0800 Subject: [PATCH] [docker] simplify and update rocm dockerfile (#1819) --- docker/Dockerfile.rocm | 201 +++-------------------------- docs/_tutorials/getting-started.md | 3 +- 2 files changed, 16 insertions(+), 188 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index f96ac1d0f305..c892503a6570 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,186 +1,15 @@ -FROM rocm/pytorch:latest - - -############################################################################## -# Temporary Installation Directory -############################################################################## -ENV STAGE_DIR=/tmp -RUN mkdir -p ${STAGE_DIR} - -############################################################################## -# Installation/Basic Utilities -############################################################################## -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - software-properties-common build-essential autotools-dev \ - nfs-common pdsh \ - cmake g++ gcc \ - curl wget vim tmux emacs less unzip \ - htop iftop iotop ca-certificates openssh-client openssh-server \ - rsync iputils-ping net-tools sudo \ - llvm-9-dev - -############################################################################## -# Installation Latest Git -############################################################################## -RUN add-apt-repository ppa:git-core/ppa -y && \ - apt-get update && \ - apt-get install -y git && \ - git --version - -############################################################################## -# Client Liveness & Uncomment Port 22 for SSH Daemon -############################################################################## -# Keep SSH client alive from server side -RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config -RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \ - sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config - -############################################################################## -# Mellanox OFED -############################################################################## -#ENV MLNX_OFED_VERSION=4.6-1.0.1.1 -#RUN apt-get install -y libnuma-dev -#RUN cd ${STAGE_DIR} && \ -# wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \ -# cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \ -# ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ -# cd ${STAGE_DIR} && \ -# rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64* - -############################################################################## -# OPENMPI -############################################################################## -#ENV OPENMPI_BASEVERSION=4.0 -#ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1 -#RUN cd ${STAGE_DIR} && \ -# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ -# cd openmpi-${OPENMPI_VERSION} && \ -# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ -# make -j"$(nproc)" install && \ -# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ -# # Sanity check: -# test -f /usr/local/mpi/bin/mpic++ && \ -# cd ${STAGE_DIR} && \ -# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} -#ENV PATH=/usr/local/mpi/bin:${PATH} \ -# LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} -## Create a wrapper for OpenMPI to allow running as root by default -#RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ -# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ -# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ -# chmod a+x /usr/local/mpi/bin/mpirun - -############################################################################## -# Python -############################################################################## -ENV DEBIAN_FRONTEND=noninteractive -ENV PYTHON_VERSION=3.6 -RUN apt-get install -y python3.6 python3.6-dev && \ - rm -f /usr/bin/python && \ - ln -s /usr/bin/python3.6 /usr/bin/python && \ - curl -O https://bootstrap.pypa.io/get-pip.py && \ - python get-pip.py && \ - rm get-pip.py && \ - pip install --upgrade pip && \ - # Print python an pip version - python -V && pip -V -RUN pip install pyyaml -RUN pip install ipython - -############################################################################## -# TensorFlow -############################################################################## -RUN pip install tensorflow-rocm - -############################################################################## -# Some Packages -############################################################################## -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - libsndfile-dev \ - libjpeg-dev \ - libpng-dev \ - screen -RUN pip install psutil \ - yappi \ - cffi \ - ipdb \ - pandas \ - matplotlib \ - py3nvml \ - pyarrow \ - graphviz \ - astor \ - boto3 \ - tqdm \ - sentencepiece \ - msgpack \ - requests \ - pandas \ - sphinx \ - sphinx_rtd_theme \ - scipy \ - numpy \ - sklearn \ - scikit-learn \ - mpi4py \ - h5py - -############################################################################## -## SSH daemon port inside container cannot conflict with host OS port -############################################################################### -ENV SSH_PORT=2222 -RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ - sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config - -############################################################################## -# PyTorch -############################################################################## -#ENV PYTORCH_VERSION=1.2.0 -#ENV TORCHVISION_VERSION=0.4.0 -#ENV TENSORBOARDX_VERSION=1.8 -#RUN pip install torch==${PYTORCH_VERSION} -#RUN pip install torchvision==${TORCHVISION_VERSION} -#RUN pip install tensorboardX==${TENSORBOARDX_VERSION} - -############################################################################## -# PyYAML build issue -# https://stackoverflow.com/a/53926898 -############################################################################## -RUN rm -rf /usr/lib/python3/dist-packages/yaml && \ - rm -rf /usr/lib/python3/dist-packages/PyYAML-* - -############################################################################## -## CuPy installation -############################################################################### -RUN git clone https://github.com/ROCmSoftwarePlatform/cupy ${STAGE_DIR}/cupy -RUN cd ${STAGE_DIR}/cupy && \ - git submodule update --init && \ - CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm pip install -e . --no-cache-dir -vvvv -RUN rm -rf ${STAGE_DIR}/cupy - -############################################################################## -## Add deepspeed user -############################################################################### -# Add a deepspeed user with user id 8877 -#RUN useradd --create-home --uid 8877 deepspeed -#RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed -#RUN usermod -aG sudo deepspeed -#RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers -# # Change to non-root privilege -#USER deepspeed - -############################################################################## -# DeepSpeed -############################################################################## -RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}/DeepSpeed -RUN cd ${STAGE_DIR}/DeepSpeed && \ - git checkout . && \ - git checkout master && \ - cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \ - cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/amd_hip_cooperative_groups.h && \ - cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \ - DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo -RUN rm -rf ${STAGE_DIR}/DeepSpeed -RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)" +FROM rocm/pytorch:rocm5.0.1_ubuntu18.04_py3.7_pytorch_1.10.0 + +# XXX: patch rocm coop groups, remove once patch is upstreamed +RUN mkdir /tmp/staging && cd /tmp/staging && \ + git clone https://github.com/microsoft/DeepSpeed.git && \ + cd DeepSpeed && \ + cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \ + cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/amd_hip_cooperative_groups.h && \ + cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \ + cd /tmp && \ + rm -rf /tmp/staging + +# install latest released version of deepspeed +RUN pip install deepspeed && \ + ds_report diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index 1e45babd569b..a86d169daf9c 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -8,10 +8,9 @@ date: 2020-05-15 ## Installation * Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/). -* Please see our [Azure tutorial](/tutorials/azure/) to get started with DeepSpeed on Azure! * To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed) -* If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies. * DeepSpeed has direct integrations with [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning). HuggingFace Transformers users can now easily accelerate their models with DeepSpeed through a simple ``--deepspeed`` flag + config file [See more details](https://huggingface.co/transformers/main_classes/trainer.html#deepspeed). PyTorch Lightning provides easy access to DeepSpeed through the Lightning Trainer [See more details](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html?highlight=deepspeed#deepspeed). +* DeepSpeed on AMD can be used via our [ROCm images](https://hub.docker.com/r/deepspeed/rocm501/tags), e.g., `docker pull deepspeed/rocm501:ds060_pytorch110`.