1
- FROM public.ecr.aws/docker/library/ubuntu:20.04
1
+ FROM public.ecr.aws/docker/library/ubuntu:22.04
2
2
3
- LABEL maintainer="Amazon AI"
4
3
LABEL dlc_major_version="1"
4
+ LABEL maintainer="Amazon AI"
5
5
6
6
# Neuron SDK components version numbers
7
- ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.16.0
8
- ARG NEURONX_DISTRIBUTED_VERSION=0.9.0
9
- ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.0.1
10
- ARG NEURONX_CC_VERSION=2.15.143.0
11
- ARG NEURONX_COLLECTIVES_LIB_VERSION=2.22.33.0-d2128d1aa
12
- ARG NEURONX_RUNTIME_LIB_VERSION=2.22.19.0-5856c0b42
13
- ARG NEURONX_TOOLS_VERSION=2.19.0.0
7
+ ARG NEURONX_RUNTIME_LIB_VERSION=2.23.110.0-9b5179492
8
+ ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.133.0-3e70920f2
9
+ ARG NEURONX_TOOLS_VERSION=2.20.204.0
10
+ ARG NEURONX_CC_VERSION=2.16.345.0
11
+ ARG NEURONX_JAX_TRAINING_VERSION=0.1.2
14
12
15
13
ARG PYTHON=python3.10
16
14
ARG PYTHON_VERSION=3.10.12
17
15
ARG PIP=pip3
18
16
ARG OMPI_VERSION=4.1.5
19
17
20
- # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
18
+ # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
21
19
ARG DEBIAN_FRONTEND=noninteractive
22
20
23
21
# Python won’t try to write .pyc or .pyo files on the import of source modules
@@ -32,9 +30,6 @@ ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
32
30
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
33
31
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
34
32
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
35
- ENV PATH /opt/aws/neuron/bin/:$PATH
36
- ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
37
- ENV DGLBACKEND=pytorch
38
33
39
34
RUN apt-get update \
40
35
&& apt-get upgrade -y \
@@ -45,45 +40,35 @@ RUN apt-get update \
45
40
curl \
46
41
emacs \
47
42
git \
43
+ gnupg2 \
44
+ gpg-agent \
48
45
jq \
49
46
libopencv-dev \
50
- openjdk-8-jdk-headless \
51
- openjdk-8-jdk \
52
- openjdk-8-jre \
53
47
libglib2.0-0 \
54
48
libgl1-mesa-glx \
55
49
libsm6 \
56
50
libxext6 \
57
51
libxrender-dev \
58
- openjdk-11-jdk \
59
- software-properties-common \
60
- wget \
61
- unzip \
62
- vim \
63
- zlib1g-dev \
64
- openssl \
65
52
libssl-dev \
66
53
libsqlite3-dev \
67
54
libgdbm-dev \
68
55
libc6-dev \
69
56
libbz2-dev \
70
57
libncurses-dev \
71
- tk-dev \
72
58
libffi-dev \
73
59
libcap-dev \
74
- gnupg2 \
75
- gpg-agent \
76
- && rm -rf /var/lib/apt/lists/* \
77
- && apt-get clean
78
-
79
- RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
80
- RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
81
-
82
- RUN apt-get update \
83
- && apt-get install -y \
84
- aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
85
- aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
86
- aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
60
+ libhwloc-dev \
61
+ openjdk-8-jdk-headless \
62
+ openjdk-8-jdk \
63
+ openjdk-8-jre \
64
+ openjdk-11-jdk \
65
+ openssl \
66
+ software-properties-common \
67
+ tk-dev \
68
+ unzip \
69
+ wget \
70
+ vim \
71
+ zlib1g-dev \
87
72
&& rm -rf /var/lib/apt/lists/* \
88
73
&& rm -rf /tmp/tmp* \
89
74
&& apt-get clean
@@ -100,6 +85,15 @@ RUN mkdir -p /tmp/openmpi \
100
85
&& ldconfig \
101
86
&& rm -rf /tmp/openmpi
102
87
88
+ # Install packages and configure SSH for MPI operator in k8s
89
+ RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
90
+ && mkdir -p /var/run/sshd \
91
+ && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
92
+ && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
93
+ && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
94
+ && rm -rf /var/lib/apt/lists/* \
95
+ && apt-get clean
96
+
103
97
# install Python
104
98
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
105
99
&& tar -xzf Python-$PYTHON_VERSION.tgz \
@@ -122,76 +116,31 @@ ENV PATH="$PATH:/home/.openmpi/bin"
122
116
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
123
117
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
124
118
125
- # Copy workaround script for incorrect hostname
126
- COPY changehostname.c /
127
- COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
119
+ RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
128
120
129
- RUN ${PIP} install --no-cache-dir -U \
130
- "bokeh>=2.3,<3" \
131
- "awscli<2" \
132
- scipy \
133
- click \
134
- "cryptography" \
135
- "sagemaker>=2,<2.184" \
136
- "sagemaker-pytorch-training" \
137
- psutil==5.6.7 \
138
- dataset \
139
- transformers==4.36.2 \
140
- Pillow
121
+ # Install Neuron Driver, Runtime and Tools
122
+ RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
123
+ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
141
124
142
- RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
125
+ RUN apt-get update \
126
+ && apt-get install -y \
127
+ aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
128
+ aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
129
+ aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
130
+ && rm -rf /var/lib/apt/lists/* \
131
+ && rm -rf /tmp/tmp* \
132
+ && apt-get clean
133
+
134
+ # Add Neuron PATH
135
+ ENV PATH="/opt/aws/neuron/bin:${PATH}"
136
+
137
+ # Install AWS CLI
138
+ RUN ${PIP} install --no-cache-dir -U "awscli<2"
139
+
140
+ # Install JAX & Neuron CC
143
141
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
144
- && ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
145
- && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
146
-
147
- RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
148
-
149
- ## Installation for Neuronx Distributed Training framework
150
- # Install Cython
151
- RUN pip install --no-cache-dir Cython
152
-
153
- # Copy the apex_setup.py file
154
- COPY apex_setup.py /root/apex_setup.py
155
-
156
- # Clone and build Apex
157
- RUN git clone https://github.com/NVIDIA/apex.git /root/apex \
158
- && cd /root/apex \
159
- && git checkout 23.05 \
160
- && cp /root/apex_setup.py setup.py \
161
- && python3 setup.py bdist_wheel
162
-
163
- #Install dependencies from requirements and extras for SageMaker usecase
164
- RUN wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/master/requirements.txt \
165
- && pip install --no-cache-dir -r requirements.txt /root/apex/dist/apex-0.1-py3-none-any.whl \
166
- && pip install --force-reinstall "multiprocess==0.70.16" \
167
- "dill==0.3.8" \
168
- "torch==1.13.1"
169
-
170
-
171
- RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
172
-
173
- # attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
174
- # protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
175
- # awscli 1.25.47 has requirement docutils<0.17,>=0.10
176
- # etcd for kubernetes installation
177
- # awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
178
- # awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
179
- RUN ${PIP} install --no-cache-dir -U \
180
- "attrs<24,>=23.1.0" \
181
- "protobuf>=3.18.3,<=3.20.3" \
182
- "docutils>=0.10,<0.17" \
183
- "rsa<4.8,>=3.1.2" \
184
- "python-etcd" \
185
- "urllib3>=1.26.0,<1.27"
186
-
187
- # Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
188
- RUN pip install --no-cache-dir -U \
189
- "bokeh>=3.0.1,<4" \
190
- "imageio>=2.22,<3" \
191
- "opencv-python>=4.8.1.78" \
192
- "plotly>=5.11,<6" \
193
- "seaborn>=0.12,<1" \
194
- "shap>=0.41,<1"
142
+ && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
143
+ && ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
195
144
196
145
# EFA Installer does apt get. Make sure to run apt update before that
197
146
RUN apt-get update
@@ -205,27 +154,14 @@ RUN cd $HOME \
205
154
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
206
155
&& cd $HOME
207
156
208
-
209
157
# Clean up after apt update
210
158
RUN rm -rf /var/lib/apt/lists/* \
211
159
&& rm -rf /tmp/tmp* \
212
160
&& apt-get clean
213
161
214
- # Install some common packages used by training scripts
215
- # torchvision needed for MLP. since it depends on torch and torch neuron/torch
216
- # is already installed install it with nodeps
217
- RUN pip3 install --no-cache-dir --no-deps -U \
218
- torchvision==0.14.*
219
-
220
- # Needed for running bert training scripts
221
- RUN pip3 install --no-cache-dir -U \
222
- graphviz \
223
- tensorboard==2.6 \
224
- accelerate \
225
- sentencepiece!=0.1.92 \
226
- h5py \
227
- requests
228
-
162
+ # Copy workaround script for incorrect hostname
163
+ COPY changehostname.c /
164
+ COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
229
165
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
230
166
231
167
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
@@ -241,8 +177,8 @@ RUN HOME_DIR=/root \
241
177
&& rm -rf ${HOME_DIR}/oss_compliance* \
242
178
&& rm -rf /tmp/tmp*
243
179
244
- RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt
245
-
246
180
# Starts framework
247
181
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
248
182
CMD ["/bin/bash"]
183
+
184
+ HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
0 commit comments