Skip to content

Commit 80e38d9

Browse files
neuron-containers-ci, kaenafwi, Fu Qiao
authored
Updated Dockerfiles (#101)
Updated Dockerfiles --------- Co-authored-by: kaenafwi <kaenafwi@aws-tonga-kaena-fwi-1fn-c0ac9e98.us-east-1.amazon.com> Co-authored-by: Fu Qiao <qiaofu@amazon.com>
1 parent 645831a commit 80e38d9

File tree

6 files changed

+864
-63
lines changed

6 files changed

+864
-63
lines changed

docker/common/apex_setup.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,32 @@
77
import subprocess
88

99
import torch
10-
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME, load
10+
from torch.utils.cpp_extension import (
11+
BuildExtension,
12+
CppExtension,
13+
CUDAExtension,
14+
CUDA_HOME,
15+
load,
16+
)
1117

1218
setup(
1319
name="apex",
1420
version="0.1",
1521
packages=find_packages(
16-
exclude=("build", "csrc", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info",)
22+
exclude=(
23+
"build",
24+
"csrc",
25+
"include",
26+
"tests",
27+
"dist",
28+
"docs",
29+
"tests",
30+
"examples",
31+
"apex.egg-info",
32+
)
1733
),
18-
install_requires=["packaging>20.6",],
34+
install_requires=[
35+
"packaging>20.6",
36+
],
1937
description="PyTorch Extensions written by NVIDIA",
20-
)
38+
)

docker/common/nxdt_install_setup.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/usr/bin/env bash
2+
3+
set -o pipefail
4+
set -e
5+
6+
# Install megatron-core
7+
git clone https://github.com/NVIDIA/Megatron-LM.git
8+
cd Megatron-LM
9+
git checkout core_r0.10.0
10+
pip install .
11+
cd megatron/core/datasets
12+
make
13+
14+
# Resolve the site-packages directory that holds nemo_toolkit.
# Match only the exact "Location: " field: a bare `grep Location` also matches pip's
# "Editable project location:" line, and splitting on ": " keeps paths that contain spaces intact.
# awk exits non-zero when the field is missing so that `set -e` / `pipefail` still abort the script.
VENV_PATH=$(pip show nemo_toolkit | awk -F': ' '/^Location: / {print $2; found=1} END {exit !found}')
15+
# Remove call to get_megatron_pretrained_bert_models() as it uses Transformer Engine which we don't support
16+
# The path is quoted so a site-packages location containing spaces or glob characters is passed to sed as one argument
sed -i 's/get_megatron_pretrained_bert_models()/[]/g' "$VENV_PATH/nemo/collections/nlp/models/nlp_model.py"
17+
# Remove filepath checking as it could be an S3 path when S3 checkpointing
18+
# The path is quoted so a site-packages location containing spaces or glob characters is passed to sed as one argument
sed -i 's/ and self\._fs\.exists(ckpt_to_dir(filepath))//g' "$VENV_PATH/nemo/utils/callbacks/nemo_model_checkpoint.py"

docker/common/nxdt_requirements.txt

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
hydra-core>=1.2.0,<1.3
2+
omegaconf>=2.2,<2.3
3+
pyyaml==6.0.1
4+
torchmetrics>=0.4.1rc0,<=0.10.3
5+
trl==0.10.1
6+
transformers==4.48.0
7+
wandb
8+
webdataset>=0.1.48,<=0.1.62
9+
pandas
10+
sentencepiece
11+
youtokentome
12+
h5py
13+
ijson
14+
matplotlib>=3.3.2
15+
sacremoses
16+
sacrebleu
17+
einops
18+
faiss-cpu
19+
sentence_transformers>=2.3
20+
nltk>=3.6.5
21+
jieba
22+
ftfy
23+
gdown
24+
inflect
25+
jieba
26+
opencc==1.1.6
27+
pangu
28+
rapidfuzz
29+
pybind11
30+
pytorch-lightning==2.5.0
31+
ipadic
32+
mecab-python3
33+
protobuf==3.20.3
34+
datasets==2.19.1
35+
dill==0.3.8
36+
nemo_toolkit==2.1.0
37+
regex
38+
requests<2.32.0
39+
python-daemon
40+
huggingface_hub>=0.27.1
41+
multiprocess==0.70.16
42+
numba<=0.60.0
43+
numpy>=1.24.3,<=1.25.2
44+
rouge_score
45+
setuptools>=70.0
46+
lightning==2.5.0
47+
ml-dtypes==0.2.0

docker/jax/training/0.5/Dockerfile.neuronx

Lines changed: 151 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,17 @@
1-
FROM public.ecr.aws/docker/library/ubuntu:22.04
1+
ARG BUILD_STAGE=prod
2+
3+
FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base
24

35
LABEL dlc_major_version="1"
46
LABEL maintainer="Amazon AI"
57

6-
# Neuron SDK components version numbers
7-
ARG NEURONX_RUNTIME_LIB_VERSION=2.24.53.0-f239092cc
8-
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.24.59.0-838c7fc8b
9-
ARG NEURONX_TOOLS_VERSION=2.22.61.0
10-
ARG NEURONX_CC_VERSION=2.17.194.0
11-
ARG NEURONX_JAX_TRAINING_VERSION=0.1.3
12-
8+
# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
9+
ARG DEBIAN_FRONTEND=noninteractive
1310
ARG PYTHON=python3.10
1411
ARG PYTHON_VERSION=3.10.12
1512
ARG PIP=pip3
1613
ARG OMPI_VERSION=4.1.5
1714

18-
# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
19-
ARG DEBIAN_FRONTEND=noninteractive
20-
2115
# Python won’t try to write .pyc or .pyo files on the import of source modules
2216
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
2317
ENV PYTHONDONTWRITEBYTECODE=1
@@ -30,6 +24,7 @@ ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
3024
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
3125
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
3226
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
27+
ENV PATH="/opt/aws/neuron/bin:${PATH}"
3328

3429
RUN apt-get update \
3530
&& apt-get upgrade -y \
@@ -86,15 +81,17 @@ RUN mkdir -p /tmp/openmpi \
8681
&& rm -rf /tmp/openmpi
8782

8883
# Install packages and configure SSH for MPI operator in k8s
89-
RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
84+
RUN apt-get update \
85+
&& apt-get install -y openmpi-bin openssh-server \
9086
&& mkdir -p /var/run/sshd \
9187
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
9288
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
9389
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
9490
&& rm -rf /var/lib/apt/lists/* \
91+
&& rm -rf /tmp/tmp* \
9592
&& apt-get clean
9693

97-
# install Python
94+
# Install Python
9895
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
9996
&& tar -xzf Python-$PYTHON_VERSION.tgz \
10097
&& cd Python-$PYTHON_VERSION \
@@ -104,8 +101,26 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER
104101
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
105102
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
106103
&& ${PIP} --no-cache-dir install --upgrade \
104+
"awscli<2" \
107105
pip \
108-
setuptools
106+
requests \
107+
setuptools \
108+
&& rm -rf ~/.cache/pip/*
109+
110+
# Install EFA
111+
RUN apt-get update \
112+
&& cd $HOME \
113+
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
114+
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
115+
&& cat aws-efa-installer.key | gpg --fingerprint \
116+
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
117+
&& tar -xf aws-efa-installer-latest.tar.gz \
118+
&& cd aws-efa-installer \
119+
&& ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
120+
&& cd $HOME \
121+
&& rm -rf /var/lib/apt/lists/* \
122+
&& rm -rf /tmp/tmp* \
123+
&& apt-get clean
109124

110125
WORKDIR /
111126

@@ -118,64 +133,141 @@ RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
118133

119134
RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
120135

121-
# Install Neuron Driver, Runtime and Tools
122-
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
123-
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
136+
# Copy workaround script for incorrect hostname
137+
COPY changehostname.c /
138+
COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/
124139

125-
RUN apt-get update \
126-
&& apt-get install -y \
127-
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
128-
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
129-
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
140+
RUN HOME_DIR=/root \
141+
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
142+
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
143+
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
144+
&& chmod +x /usr/local/bin/testOSSCompliance \
145+
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
146+
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
147+
&& rm -rf ${HOME_DIR}/oss_compliance* \
148+
&& rm -rf /tmp/tmp*
149+
150+
# Setting up APT and PIP repo for neuron artifacts
151+
# Host name only (no scheme): the RUN below builds "https://[<key>@]<host>" itself, so a
# default that already starts with "https://" would produce "https://https://..." URLs.
ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
152+
ARG NEURON_APT_REPO_KEY
153+
# Host name only (no scheme): the RUN below builds "https://[<key>@]<host>" for pip's
# extra-index-url, so a default that already starts with "https://" would double the scheme.
ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
154+
ARG NEURON_PIP_REPO_KEY
155+
RUN mkdir -p /etc/apt/keyrings \
156+
&& APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
157+
&& echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} focal main" > /etc/apt/sources.list.d/neuron.list \
158+
&& curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
159+
&& PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
160+
&& ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
161+
162+
# Neuron SDK components version numbers
163+
ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
164+
ARG IGNORE_MISSING_NEURON_COMPONENTS=false
165+
# NOTE: IGNORE_MISSING_NEURON_COMPONENTS is compared literally against "false" by the install
# helpers in the dev stage, so pass it in lower case ("true" / "false").
# It cannot be normalized here: a shell variable assigned in its own RUN exists only for that
# RUN's shell and is not visible to later instructions, so such a step only adds an empty layer.
166+
167+
ARG NEURONX_RUNTIME_LIB_VERSION=2.25.57.0-166c7a468
168+
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.25.65.0-9858ac9a1
169+
ARG NEURONX_TOOLS_VERSION=2.23.9.0
170+
171+
ARG NEURONX_CC_VERSION=2.18.121.0
172+
ARG NEURONX_JAX_TRAINING_VERSION=0.5.3.1.0.719+1d9c17be
173+
174+
FROM base AS dev
175+
176+
RUN --mount=type=bind,source=apt,target=${NEURON_ARTIFACT_PATH}/apt \
177+
install_apt_package() { \
178+
pkg_name=$1; \
179+
version_arg=$2; \
180+
if [ -f "${NEURON_ARTIFACT_PATH}/apt/${version_arg}" ]; then \
181+
apt-get install -y ${NEURON_ARTIFACT_PATH}/apt/${version_arg}; \
182+
elif [ "${IGNORE_MISSING_NEURON_COMPONENTS}" = "false" ]; then \
183+
apt-get install -y ${pkg_name}=${version_arg}; \
184+
else \
185+
echo "Ignoring package ${pkg_name}"; \
186+
fi; \
187+
} \
188+
&& apt-get update \
189+
&& install_apt_package "aws-neuronx-collectives" "${NEURONX_COLLECTIVES_LIB_VERSION}" \
190+
&& install_apt_package "aws-neuronx-runtime-lib" "${NEURONX_RUNTIME_LIB_VERSION}" \
191+
&& install_apt_package "aws-neuronx-tools" "${NEURONX_TOOLS_VERSION}" \
130192
&& rm -rf /var/lib/apt/lists/* \
131193
&& rm -rf /tmp/tmp* \
132194
&& apt-get clean
133195

134-
# Add Neuron PATH
135-
ENV PATH="/opt/aws/neuron/bin:${PATH}"
196+
RUN --mount=type=bind,source=pip,target=${NEURON_ARTIFACT_PATH}/pip \
197+
install_pip_package() { \
198+
packages=""; \
199+
flags=""; \
200+
while [ "$#" -gt 0 ]; do \
201+
pkg_name=$(echo $1 | cut -d: -f1); \
202+
version_arg=$(echo $1 | cut -d: -f2); \
203+
extra_flags=$(echo $1 | cut -d: -f3); \
204+
if [ -f "${NEURON_ARTIFACT_PATH}/pip/${version_arg}" ]; then \
205+
packages="${packages} ${NEURON_ARTIFACT_PATH}/pip/${version_arg}"; \
206+
else \
207+
if [ "${IGNORE_MISSING_NEURON_COMPONENTS}" = "false" ]; then \
208+
packages="${packages} ${pkg_name}==${version_arg}"; \
209+
else \
210+
echo "Ignoring package ${pkg_name}"; \
211+
fi; \
212+
fi; \
213+
# Store unique flags
214+
if [ ! -z "${extra_flags}" ]; then \
215+
for flag in $(echo "${extra_flags}" | tr ' ' '\n'); do \
216+
case " ${flags} " in \
217+
*" ${flag} "*) ;; \
218+
*) flags="${flags} ${flag}" ;; \
219+
esac \
220+
done; \
221+
fi; \
222+
shift; \
223+
done; \
224+
if [ ! -z "${packages}" ]; then \
225+
echo "Installing packages: ${packages} with flags ${flags}"; \
226+
${PIP} install --no-cache-dir --force-reinstall \
227+
--extra-index-url="file:///${NEURON_ARTIFACT_PATH}/pip" \
228+
${packages} ${flags}; \
229+
fi; \
230+
} \
231+
&& install_pip_package "neuronx-cc:${NEURONX_CC_VERSION}:" "jax-neuronx:${NEURONX_JAX_TRAINING_VERSION}:" \
232+
&& rm -rf ~/.cache/pip/*
136233

137-
# Install AWS CLI
138-
RUN ${PIP} install --no-cache-dir -U "awscli<2"
234+
FROM base AS repo
139235

140-
# Install JAX & Neuron CC
141-
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
142-
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
143-
&& ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
236+
# Install Neuron components from the apt and pip repos
237+
RUN apt-get update \
238+
&& apt-get install -y \
239+
aws-neuronx-tools \
240+
aws-neuronx-collectives \
241+
aws-neuronx-runtime-lib \
242+
&& rm -rf /var/lib/apt/lists/* \
243+
&& rm -rf /tmp/tmp* \
244+
&& apt-get clean \
245+
&& ${PIP} install --no-cache-dir --force-reinstall \
246+
neuronx-cc \
247+
jax-neuronx \
248+
&& rm -rf ~/.cache/pip/*
144249

145-
# EFA Installer does apt get. Make sure to run apt update before that
146-
RUN apt-get update
147-
RUN cd $HOME \
148-
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
149-
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
150-
&& cat aws-efa-installer.key | gpg --fingerprint \
151-
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
152-
&& tar -xf aws-efa-installer-latest.tar.gz \
153-
&& cd aws-efa-installer \
154-
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
155-
&& cd $HOME
156250

157-
# Clean up after apt update
158-
RUN rm -rf /var/lib/apt/lists/* \
251+
FROM base AS prod
252+
253+
# Install Neuron components
254+
# Install Neuron Driver, Runtime and Tools
255+
RUN apt-get update \
256+
&& apt-get install -y \
257+
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
258+
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
259+
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
260+
&& rm -rf /var/lib/apt/lists/* \
159261
&& rm -rf /tmp/tmp* \
160262
&& apt-get clean
161263

162-
# Copy workaround script for incorrect hostname
163-
COPY changehostname.c /
164-
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
165-
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
166-
167-
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
168-
&& chmod +x /usr/local/bin/deep_learning_container.py
264+
# Install JAX & Neuron CC
265+
RUN ${PIP} install --no-cache-dir --force-reinstall \
266+
neuronx-cc==$NEURONX_CC_VERSION \
267+
jax-neuronx==$NEURONX_JAX_TRAINING_VERSION \
268+
&& rm -rf ~/.cache/pip/*
169269

170-
RUN HOME_DIR=/root \
171-
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
172-
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
173-
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
174-
&& chmod +x /usr/local/bin/testOSSCompliance \
175-
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
176-
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
177-
&& rm -rf ${HOME_DIR}/oss_compliance* \
178-
&& rm -rf /tmp/tmp*
270+
FROM ${BUILD_STAGE} AS final
179271

180272
# Starts framework
181273
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]

0 commit comments

Comments
 (0)