Skip to content

Commit

Permalink
[docker] simplify and update rocm dockerfile (#1819)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffra authored Mar 9, 2022
1 parent 097efeb commit ac71a1a
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 188 deletions.
201 changes: 15 additions & 186 deletions docker/Dockerfile.rocm
Original file line number Diff line number Diff line change
@@ -1,186 +1,15 @@
# Base image. Pinned to an explicit release tag instead of the moving `latest`
# tag so that rebuilding this file always starts from the same ROCm/PyTorch
# stack (the tag names ROCm 5.0.1, Ubuntu 18.04, Python 3.7, PyTorch 1.10.0).
FROM rocm/pytorch:rocm5.0.1_ubuntu18.04_py3.7_pytorch_1.10.0


##############################################################################
# Temporary Installation Directory
##############################################################################
# Scratch directory used by the later steps for staged config files and
# cloned source trees.
ENV STAGE_DIR=/tmp
RUN mkdir -p ${STAGE_DIR}

##############################################################################
# Installation/Basic Utilities
##############################################################################
# The package index is refreshed, used and removed inside a single layer so
# the downloaded lists are not stored in the image. DEBIAN_FRONTEND is set
# inline so that no package configuration step can wait for input.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        software-properties-common build-essential autotools-dev \
        nfs-common pdsh \
        cmake g++ gcc \
        curl wget vim tmux emacs less unzip \
        htop iftop iotop ca-certificates openssh-client openssh-server \
        rsync iputils-ping net-tools sudo \
        llvm-9-dev && \
    rm -rf /var/lib/apt/lists/*

##############################################################################
# Installation Latest Git
##############################################################################
# Register the git-core PPA, refresh the package index so the PPA is visible,
# install git from it, and print the resulting version into the build log.
RUN add-apt-repository -y ppa:git-core/ppa \
    && apt-get update \
    && apt-get install --yes git \
    && git --version

##############################################################################
# Client Liveness & Uncomment Port 22 for SSH Daemon
##############################################################################
# 1. Append a server-side keep-alive setting to sshd_config.
# 2. Stage a copy of the config, because the shell truncates the redirect
#    target before sed would get to read it.
# 3. Rewrite the first "#Port 22" line to "Port 22" from the staged copy back
#    into the live config.
RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config \
    && cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config \
    && sed -e "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config

##############################################################################
# Mellanox OFED
##############################################################################
#ENV MLNX_OFED_VERSION=4.6-1.0.1.1
#RUN apt-get install -y libnuma-dev
#RUN cd ${STAGE_DIR} && \
# wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \
# cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
# ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
# cd ${STAGE_DIR} && \
# rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*

##############################################################################
# OPENMPI
##############################################################################
#ENV OPENMPI_BASEVERSION=4.0
#ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1
#RUN cd ${STAGE_DIR} && \
# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
# cd openmpi-${OPENMPI_VERSION} && \
# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
# make -j"$(nproc)" install && \
# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
# # Sanity check:
# test -f /usr/local/mpi/bin/mpic++ && \
# cd ${STAGE_DIR} && \
# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
#ENV PATH=/usr/local/mpi/bin:${PATH} \
# LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
## Create a wrapper for OpenMPI to allow running as root by default
#RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
# chmod a+x /usr/local/mpi/bin/mpirun

##############################################################################
# Python
##############################################################################
# DEBIAN_FRONTEND is kept as ENV because later apt-get steps in this file
# inherit it.
ENV DEBIAN_FRONTEND=noninteractive
# Interpreter version installed below and linked as /usr/bin/python.
ENV PYTHON_VERSION=3.6
# - `apt-get update` runs in the same layer as the install so the step never
#   depends on package lists left behind by an earlier layer.
# - get-pip.py is fetched from the version-specific URL: the generic
#   bootstrap script refuses to run on end-of-life interpreters such as 3.6.
#   NOTE(review): if PYTHON_VERSION is bumped to a currently supported
#   release, switch back to https://bootstrap.pypa.io/get-pip.py.
# - `curl -f` makes a failed download abort the build instead of saving an
#   HTTP error page as get-pip.py.
# - The final command prints the python and pip versions into the build log.
RUN apt-get update && \
    apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev && \
    rm -rf /var/lib/apt/lists/* && \
    rm -f /usr/bin/python && \
    ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \
    curl -fsSLO https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py && \
    pip install --no-cache-dir --upgrade pip && \
    python -V && pip -V
RUN pip install --no-cache-dir pyyaml ipython

##############################################################################
# TensorFlow
##############################################################################
# ROCm build of TensorFlow. --no-cache-dir keeps pip's download cache out of
# the image layer.
RUN pip install --no-cache-dir tensorflow-rocm

##############################################################################
# Some Packages
##############################################################################
# Extra system libraries and tools. The package lists are deleted in the same
# layer that downloaded them so they do not add to the image size.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libsndfile-dev \
        libjpeg-dev \
        libpng-dev \
        screen && \
    rm -rf /var/lib/apt/lists/*
# Python packages installed with pip.
# - `pandas` was listed twice; it is installed once here.
# - The `sklearn` PyPI name is a deprecated alias; `scikit-learn` is the real
#   distribution and already provides the `sklearn` import.
# - --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir \
    psutil \
    yappi \
    cffi \
    ipdb \
    pandas \
    matplotlib \
    py3nvml \
    pyarrow \
    graphviz \
    astor \
    boto3 \
    tqdm \
    sentencepiece \
    msgpack \
    requests \
    sphinx \
    sphinx_rtd_theme \
    scipy \
    numpy \
    scikit-learn \
    mpi4py \
    h5py

##############################################################################
## SSH daemon port inside container cannot conflict with host OS port
###############################################################################
ENV SSH_PORT=2222
# Point the first Port directive at ${SSH_PORT}. The pattern accepts both the
# commented default ("#Port 22") and an already uncommented "Port <n>" line:
# an earlier step in this file rewrites "#Port 22" to "Port 22", so matching
# only the commented form would leave the daemon on port 22.
RUN sed -i "0,/^#\?Port [0-9]\+/s//Port ${SSH_PORT}/" /etc/ssh/sshd_config

##############################################################################
# PyTorch
##############################################################################
#ENV PYTORCH_VERSION=1.2.0
#ENV TORCHVISION_VERSION=0.4.0
#ENV TENSORBOARDX_VERSION=1.8
#RUN pip install torch==${PYTORCH_VERSION}
#RUN pip install torchvision==${TORCHVISION_VERSION}
#RUN pip install tensorboardX==${TENSORBOARDX_VERSION}

##############################################################################
# PyYAML build issue
# https://stackoverflow.com/a/53926898
##############################################################################
# Delete the PyYAML package directory and its metadata from the system
# dist-packages directory (see the link above for the background).
RUN rm -rf \
    /usr/lib/python3/dist-packages/yaml \
    /usr/lib/python3/dist-packages/PyYAML-*

##############################################################################
## CuPy installation
###############################################################################
# Build CuPy for HIP from the ROCm fork.
# - A regular (non-editable) install is used: an editable install only links
#   to the source checkout, which is deleted at the end of this step.
# - Clone, build and cleanup share one RUN so the source tree and build
#   artifacts are never stored in an image layer.
RUN git clone https://github.com/ROCmSoftwarePlatform/cupy ${STAGE_DIR}/cupy && \
    cd ${STAGE_DIR}/cupy && \
    git submodule update --init && \
    CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm pip install . --no-cache-dir -vvvv && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/cupy

##############################################################################
## Add deepspeed user
###############################################################################
# Add a deepspeed user with user id 8877
#RUN useradd --create-home --uid 8877 deepspeed
#RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed
#RUN usermod -aG sudo deepspeed
#RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
# # Change to non-root privilege
#USER deepspeed

##############################################################################
# DeepSpeed
##############################################################################
# Build and install DeepSpeed from the ROCm fork.
# - The three cooperative-groups headers shipped in the repository are copied
#   over the ones under /opt/rocm before the build.
# - The DS_BUILD_* variables are passed to install.sh to select which ops it
#   builds.
# - Clone, build and cleanup share one RUN: removing the checkout in a later
#   layer would not shrink the image.
RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \
    cd ${STAGE_DIR}/DeepSpeed && \
    git checkout . && \
    git checkout master && \
    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/amd_hip_cooperative_groups.h && \
    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/DeepSpeed
# Smoke test: import the installed package from outside the (removed) source
# tree and print its version.
RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"
# Base image pinned to an explicit release tag; the tag names ROCm 5.0.1,
# Ubuntu 18.04, Python 3.7 and PyTorch 1.10.0.
FROM rocm/pytorch:rocm5.0.1_ubuntu18.04_py3.7_pytorch_1.10.0

# XXX: patch rocm coop groups, remove once patch is upstreamed
# Only three header files are needed from the repository, so a shallow clone
# (--depth 1) is used instead of downloading the full history. The checkout
# is created and deleted inside the same layer and never ships in the image.
RUN mkdir /tmp/staging && cd /tmp/staging && \
    git clone --depth 1 https://github.com/microsoft/DeepSpeed.git && \
    cd DeepSpeed && \
    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/amd_hip_cooperative_groups.h && \
    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
    cd /tmp && \
    rm -rf /tmp/staging

# install latest released version of deepspeed
# --no-cache-dir keeps pip's download cache out of the image layer; ds_report
# runs right after the install so its report appears in the build log and a
# failing report fails the build.
RUN pip install --no-cache-dir deepspeed && \
    ds_report
3 changes: 1 addition & 2 deletions docs/_tutorials/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ date: 2020-05-15
## Installation

* Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/).
* Please see our [Azure tutorial](/tutorials/azure/) to get started with DeepSpeed on Azure!
* To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed)
* If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies.
* DeepSpeed has direct integrations with [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning). HuggingFace Transformers users can now easily accelerate their models with DeepSpeed through a simple ``--deepspeed`` flag and a config file ([see more details](https://huggingface.co/transformers/main_classes/trainer.html#deepspeed)). PyTorch Lightning provides easy access to DeepSpeed through the Lightning Trainer ([see more details](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html?highlight=deepspeed#deepspeed)).
* DeepSpeed on AMD can be used via our [ROCm images](https://hub.docker.com/r/deepspeed/rocm501/tags), e.g., `docker pull deepspeed/rocm501:ds060_pytorch110`.



Expand Down

0 comments on commit ac71a1a

Please sign in to comment.