1
- FROM public.ecr.aws/docker/library/ubuntu:22.04
1
+ ARG BUILD_STAGE=prod
2
+
3
+ FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base
2
4
3
5
LABEL dlc_major_version="1"
4
6
LABEL maintainer="Amazon AI"
5
7
6
- # Neuron SDK components version numbers
7
- ARG NEURONX_RUNTIME_LIB_VERSION=2.24.53.0-f239092cc
8
- ARG NEURONX_COLLECTIVES_LIB_VERSION=2.24.59.0-838c7fc8b
9
- ARG NEURONX_TOOLS_VERSION=2.22.61.0
10
- ARG NEURONX_CC_VERSION=2.17.194.0
11
- ARG NEURONX_JAX_TRAINING_VERSION=0.1.3
12
-
8
+ # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
9
+ ARG DEBIAN_FRONTEND=noninteractive
13
10
ARG PYTHON=python3.10
14
11
ARG PYTHON_VERSION=3.10.12
15
12
ARG PIP=pip3
16
13
ARG OMPI_VERSION=4.1.5
17
14
18
- # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
19
- ARG DEBIAN_FRONTEND=noninteractive
20
-
21
15
# Python won’t try to write .pyc or .pyo files on the import of source modules
22
16
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
23
17
ENV PYTHONDONTWRITEBYTECODE=1
@@ -30,6 +24,7 @@ ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
30
24
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
31
25
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
32
26
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
27
+ ENV PATH="/opt/aws/neuron/bin:${PATH}"
33
28
34
29
RUN apt-get update \
35
30
&& apt-get upgrade -y \
@@ -86,15 +81,17 @@ RUN mkdir -p /tmp/openmpi \
86
81
&& rm -rf /tmp/openmpi
87
82
88
83
# Install packages and configure SSH for MPI operator in k8s
89
- RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
84
+ RUN apt-get update \
85
+ && apt-get install -y openmpi-bin openssh-server \
90
86
&& mkdir -p /var/run/sshd \
91
87
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
92
88
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
93
89
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
94
90
&& rm -rf /var/lib/apt/lists/* \
91
+ && rm -rf /tmp/tmp* \
95
92
&& apt-get clean
96
93
97
- # install Python
94
+ # Install Python
98
95
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
99
96
&& tar -xzf Python-$PYTHON_VERSION.tgz \
100
97
&& cd Python-$PYTHON_VERSION \
@@ -104,8 +101,26 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER
104
101
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
105
102
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
106
103
&& ${PIP} --no-cache-dir install --upgrade \
104
+ "awscli<2" \
107
105
pip \
108
- setuptools
106
+ requests \
107
+ setuptools \
108
+ && rm -rf ~/.cache/pip/*
109
+
110
+ # Install EFA
111
+ RUN apt-get update \
112
+ && cd $HOME \
113
+ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
114
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
115
+ && cat aws-efa-installer.key | gpg --fingerprint \
116
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
117
+ && tar -xf aws-efa-installer-latest.tar.gz \
118
+ && cd aws-efa-installer \
119
+ && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
120
+ && cd $HOME \
121
+ && rm -rf /var/lib/apt/lists/* \
122
+ && rm -rf /tmp/tmp* \
123
+ && apt-get clean
109
124
110
125
WORKDIR /
111
126
@@ -118,64 +133,141 @@ RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
118
133
119
134
RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
120
135
121
- # Install Neuron Driver, Runtime and Tools
122
- RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
123
- RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
136
+ # Copy workaround script for incorrect hostname
137
+ COPY changehostname.c /
138
+ COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/
124
139
125
- RUN apt-get update \
126
- && apt-get install -y \
127
- aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
128
- aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
129
- aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
140
+ RUN HOME_DIR=/root \
141
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
142
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
143
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
144
+ && chmod +x /usr/local/bin/testOSSCompliance \
145
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
146
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
147
+ && rm -rf ${HOME_DIR}/oss_compliance* \
148
+ && rm -rf /tmp/tmp*
149
+
150
+ # Setting up APT and PIP repo for neuron artifacts.
+ # NEURON_APT_REPO / NEURON_PIP_REPO may be given with or without a URL scheme: any
+ # leading "<scheme>://" is stripped below before "https://" (and the optional
+ # "<key>@" credential prefix) is prepended, so the defaults no longer expand to
+ # "https://https://...".
+ # NOTE(review): when the *_KEY args are set they are written into neuron.list and
+ # passed to "pip config set", so they appear to persist in the image - consider
+ # BuildKit secret mounts instead.
151
+ ARG NEURON_APT_REPO=https://apt.repos.neuron.amazonaws.com
152
+ ARG NEURON_APT_REPO_KEY
153
+ ARG NEURON_PIP_REPO=https://pip.repos.neuron.amazonaws.com
154
+ ARG NEURON_PIP_REPO_KEY
155
+ RUN mkdir -p /etc/apt/keyrings \
156
+ && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
157
+ && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO#*://} focal main" > /etc/apt/sources.list.d/neuron.list \
158
+ && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO#*://}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
159
+ && PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO#*://}" || echo "https://${NEURON_PIP_REPO#*://}") \
160
+ && ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
161
+
162
+ # Directory used as the bind-mount target for locally supplied Neuron artifacts in
+ # the dev stage, and the switch deciding whether a component missing from that
+ # directory is skipped ("true") or installed from the repo instead ("false").
163
+ ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
164
+ ARG IGNORE_MISSING_NEURON_COMPONENTS=false
165
+ # A RUN cannot hand shell variables to later instructions, so the flag cannot be
+ # lower-cased here (the former RUN doing so was a no-op layer); pass it as a
+ # lowercase true/false, or normalize it inside the RUN that reads it.
166
+ # Neuron SDK components version numbers
167
+ ARG NEURONX_RUNTIME_LIB_VERSION=2.25.57.0-166c7a468
168
+ ARG NEURONX_COLLECTIVES_LIB_VERSION=2.25.65.0-9858ac9a1
169
+ ARG NEURONX_TOOLS_VERSION=2.23.9.0
170
+
171
+ ARG NEURONX_CC_VERSION=2.18.121.0
172
+ ARG NEURONX_JAX_TRAINING_VERSION=0.5.3.1.0.719+1d9c17be
173
+
174
+ FROM base AS dev
175
+
176
+ # Install the Neuron apt packages. For each package, a file named after the version
+ # argument inside the bind-mounted apt directory is installed when it exists;
+ # otherwise the pinned version is installed from the apt repo, unless
+ # IGNORE_MISSING_NEURON_COMPONENTS is anything other than "false", in which case the
+ # package is skipped. The flag is lower-cased at the comparison so that values such
+ # as "False" or "FALSE" are not silently treated as "skip".
+ RUN --mount=type=bind,source=apt,target=${NEURON_ARTIFACT_PATH}/apt \
177
+ install_apt_package() { \
178
+ pkg_name=$1; \
179
+ version_arg=$2; \
180
+ if [ -f "${NEURON_ARTIFACT_PATH}/apt/${version_arg}" ]; then \
181
+ apt-get install -y ${NEURON_ARTIFACT_PATH}/apt/${version_arg}; \
182
+ elif [ "$(echo "${IGNORE_MISSING_NEURON_COMPONENTS}" | tr '[:upper:]' '[:lower:]')" = "false" ]; then \
183
+ apt-get install -y ${pkg_name}=${version_arg}; \
184
+ else \
185
+ echo "Ignoring package ${pkg_name}"; \
186
+ fi; \
187
+ } \
188
+ && apt-get update \
189
+ && install_apt_package "aws-neuronx-collectives" "${NEURONX_COLLECTIVES_LIB_VERSION}" \
190
+ && install_apt_package "aws-neuronx-runtime-lib" "${NEURONX_RUNTIME_LIB_VERSION}" \
191
+ && install_apt_package "aws-neuronx-tools" "${NEURONX_TOOLS_VERSION}" \
130
192
&& rm -rf /var/lib/apt/lists/* \
131
193
&& rm -rf /tmp/tmp* \
132
194
&& apt-get clean
133
195
134
- # Add Neuron PATH
135
- ENV PATH="/opt/aws/neuron/bin:${PATH}"
196
+ # Install the Neuron pip packages in a single pip invocation. Each argument has the
+ # form "name:version:extra flags". A file named after the version field inside the
+ # bind-mounted pip directory is used when it exists; otherwise "name==version" is
+ # requested from the configured indexes, unless IGNORE_MISSING_NEURON_COMPONENTS is
+ # anything other than "false", in which case the package is skipped. The flag is
+ # lower-cased at the comparison so that values such as "False" or "FALSE" are not
+ # silently treated as "skip". Extra flags are de-duplicated before being passed on.
+ RUN --mount=type=bind,source=pip,target=${NEURON_ARTIFACT_PATH}/pip \
197
+ install_pip_package() { \
198
+ packages=""; \
199
+ flags=""; \
200
+ while [ "$#" -gt 0 ]; do \
201
+ pkg_name=$(echo $1 | cut -d: -f1); \
202
+ version_arg=$(echo $1 | cut -d: -f2); \
203
+ extra_flags=$(echo $1 | cut -d: -f3); \
204
+ if [ -f "${NEURON_ARTIFACT_PATH}/pip/${version_arg}" ]; then \
205
+ packages="${packages} ${NEURON_ARTIFACT_PATH}/pip/${version_arg}"; \
206
+ else \
207
+ if [ "$(echo "${IGNORE_MISSING_NEURON_COMPONENTS}" | tr '[:upper:]' '[:lower:]')" = "false" ]; then \
208
+ packages="${packages} ${pkg_name}==${version_arg}"; \
209
+ else \
210
+ echo "Ignoring package ${pkg_name}"; \
211
+ fi; \
212
+ fi; \
213
+ # Store unique flags
214
+ if [ ! -z "${extra_flags}" ]; then \
215
+ for flag in $(echo "${extra_flags}" | tr ' ' '\n'); do \
216
+ case " ${flags} " in \
217
+ *" ${flag} "*) ;; \
218
+ *) flags="${flags} ${flag}" ;; \
219
+ esac \
220
+ done; \
221
+ fi; \
222
+ shift; \
223
+ done; \
224
+ if [ ! -z "${packages}" ]; then \
225
+ echo "Installing packages: ${packages} with flags ${flags}"; \
226
+ ${PIP} install --no-cache-dir --force-reinstall \
227
+ --extra-index-url="file:///${NEURON_ARTIFACT_PATH}/pip" \
228
+ ${packages} ${flags}; \
229
+ fi; \
230
+ } \
231
+ && install_pip_package "neuronx-cc:${NEURONX_CC_VERSION}:" "jax-neuronx:${NEURONX_JAX_TRAINING_VERSION}:" \
232
+ && rm -rf ~/.cache/pip/*
136
233
137
- # Install AWS CLI
138
- RUN ${PIP} install --no-cache-dir -U "awscli<2"
234
+ FROM base AS repo
139
235
140
- # Install JAX & Neuron CC
141
- RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
142
- && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
143
- && ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
236
+ # Install Neuron components from the apt and pip repos
237
+ RUN apt-get update \
238
+ && apt-get install -y \
239
+ aws-neuronx-tools \
240
+ aws-neuronx-collectives \
241
+ aws-neuronx-runtime-lib \
242
+ && rm -rf /var/lib/apt/lists/* \
243
+ && rm -rf /tmp/tmp* \
244
+ && apt-get clean \
245
+ && ${PIP} install --no-cache-dir --force-reinstall \
246
+ neuronx-cc \
247
+ jax-neuronx \
248
+ && rm -rf ~/.cache/pip/*
144
249
145
- # EFA Installer does apt get. Make sure to run apt update before that
146
- RUN apt-get update
147
- RUN cd $HOME \
148
- && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
149
- && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
150
- && cat aws-efa-installer.key | gpg --fingerprint \
151
- && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
152
- && tar -xf aws-efa-installer-latest.tar.gz \
153
- && cd aws-efa-installer \
154
- && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
155
- && cd $HOME
156
250
157
- # Clean up after apt update
158
- RUN rm -rf /var/lib/apt/lists/* \
251
+ FROM base AS prod
252
+
253
+ # Install Neuron components
254
+ # Install Neuron Driver, Runtime and Tools
255
+ RUN apt-get update \
256
+ && apt-get install -y \
257
+ aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
258
+ aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
259
+ aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
260
+ && rm -rf /var/lib/apt/lists/* \
159
261
&& rm -rf /tmp/tmp* \
160
262
&& apt-get clean
161
263
162
- # Copy workaround script for incorrect hostname
163
- COPY changehostname.c /
164
- COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
165
- COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
166
-
167
- RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
168
- && chmod +x /usr/local/bin/deep_learning_container.py
264
+ # Install JAX & Neuron CC
265
+ RUN ${PIP} install --no-cache-dir --force-reinstall \
266
+ neuronx-cc==$NEURONX_CC_VERSION \
267
+ jax-neuronx==$NEURONX_JAX_TRAINING_VERSION \
268
+ && rm -rf ~/.cache/pip/*
169
269
170
- RUN HOME_DIR=/root \
171
- && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
172
- && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
173
- && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
174
- && chmod +x /usr/local/bin/testOSSCompliance \
175
- && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
176
- && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
177
- && rm -rf ${HOME_DIR}/oss_compliance* \
178
- && rm -rf /tmp/tmp*
270
+ FROM ${BUILD_STAGE} AS final
179
271
180
272
# Starts framework
181
273
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
0 commit comments