Skip to content

Commit 315b77b

Browse files
authored
Dockerfile update for Release 2.21 (#31)
*Description of changes:* * Addition of the JAX NeuronX Dockerfile * Removal of the PyTorch 1.13 and 2.1 Dockerfiles. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
1 parent 947ce54 commit 315b77b

12 files changed

+358
-982
lines changed

.github/workflows/ci.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,6 @@ jobs:
1818
HADOLINT_RECURSIVE: "true"
1919
steps:
2020
- uses: actions/checkout@v4
21-
- uses: hadolint/hadolint-action@v3.1.0
22-
with:
23-
dockerfile: Dockerfile.neuron
24-
recursive: true
25-
failure-threshold: error # TODO: enable more linter rules other than error.
2621
- uses: hadolint/hadolint-action@v3.1.0
2722
with:
2823
dockerfile: Dockerfile.neuronx
Lines changed: 58 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,21 @@
1-
FROM public.ecr.aws/docker/library/ubuntu:20.04
1+
FROM public.ecr.aws/docker/library/ubuntu:22.04
22

3-
LABEL maintainer="Amazon AI"
43
LABEL dlc_major_version="1"
4+
LABEL maintainer="Amazon AI"
55

66
# Neuron SDK components version numbers
7-
ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.16.0
8-
ARG NEURONX_DISTRIBUTED_VERSION=0.9.0
9-
ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.0.1
10-
ARG NEURONX_CC_VERSION=2.15.143.0
11-
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.22.33.0-d2128d1aa
12-
ARG NEURONX_RUNTIME_LIB_VERSION=2.22.19.0-5856c0b42
13-
ARG NEURONX_TOOLS_VERSION=2.19.0.0
7+
ARG NEURONX_RUNTIME_LIB_VERSION=2.23.110.0-9b5179492
8+
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.133.0-3e70920f2
9+
ARG NEURONX_TOOLS_VERSION=2.20.204.0
10+
ARG NEURONX_CC_VERSION=2.16.345.0
11+
ARG NEURONX_JAX_TRAINING_VERSION=0.1.2
1412

1513
ARG PYTHON=python3.10
1614
ARG PYTHON_VERSION=3.10.12
1715
ARG PIP=pip3
1816
ARG OMPI_VERSION=4.1.5
1917

20-
# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
18+
# This arg is required to stop the docker build from waiting for region configuration while installing tzdata on Ubuntu 22
2119
ARG DEBIAN_FRONTEND=noninteractive
2220

2321
# Python won’t try to write .pyc or .pyo files on the import of source modules
@@ -32,9 +30,6 @@ ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
3230
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
3331
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
3432
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
35-
ENV PATH /opt/aws/neuron/bin/:$PATH
36-
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
37-
ENV DGLBACKEND=pytorch
3833

3934
RUN apt-get update \
4035
&& apt-get upgrade -y \
@@ -45,45 +40,35 @@ RUN apt-get update \
4540
curl \
4641
emacs \
4742
git \
43+
gnupg2 \
44+
gpg-agent \
4845
jq \
4946
libopencv-dev \
50-
openjdk-8-jdk-headless \
51-
openjdk-8-jdk \
52-
openjdk-8-jre \
5347
libglib2.0-0 \
5448
libgl1-mesa-glx \
5549
libsm6 \
5650
libxext6 \
5751
libxrender-dev \
58-
openjdk-11-jdk \
59-
software-properties-common \
60-
wget \
61-
unzip \
62-
vim \
63-
zlib1g-dev \
64-
openssl \
6552
libssl-dev \
6653
libsqlite3-dev \
6754
libgdbm-dev \
6855
libc6-dev \
6956
libbz2-dev \
7057
libncurses-dev \
71-
tk-dev \
7258
libffi-dev \
7359
libcap-dev \
74-
gnupg2 \
75-
gpg-agent \
76-
&& rm -rf /var/lib/apt/lists/* \
77-
&& apt-get clean
78-
79-
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
80-
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
81-
82-
RUN apt-get update \
83-
&& apt-get install -y \
84-
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
85-
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
86-
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
60+
libhwloc-dev \
61+
openjdk-8-jdk-headless \
62+
openjdk-8-jdk \
63+
openjdk-8-jre \
64+
openjdk-11-jdk \
65+
openssl \
66+
software-properties-common \
67+
tk-dev \
68+
unzip \
69+
wget \
70+
vim \
71+
zlib1g-dev \
8772
&& rm -rf /var/lib/apt/lists/* \
8873
&& rm -rf /tmp/tmp* \
8974
&& apt-get clean
@@ -100,6 +85,15 @@ RUN mkdir -p /tmp/openmpi \
10085
&& ldconfig \
10186
&& rm -rf /tmp/openmpi
10287

88+
# Install packages and configure SSH for MPI operator in k8s
89+
RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
90+
&& mkdir -p /var/run/sshd \
91+
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
92+
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
93+
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
94+
&& rm -rf /var/lib/apt/lists/* \
95+
&& apt-get clean
96+
10397
# install Python
10498
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
10599
&& tar -xzf Python-$PYTHON_VERSION.tgz \
@@ -122,76 +116,31 @@ ENV PATH="$PATH:/home/.openmpi/bin"
122116
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
123117
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
124118

125-
# Copy workaround script for incorrect hostname
126-
COPY changehostname.c /
127-
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
119+
RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
128120

129-
RUN ${PIP} install --no-cache-dir -U \
130-
"bokeh>=2.3,<3" \
131-
"awscli<2" \
132-
scipy \
133-
click \
134-
"cryptography" \
135-
"sagemaker>=2,<2.184" \
136-
"sagemaker-pytorch-training" \
137-
psutil==5.6.7 \
138-
dataset \
139-
transformers==4.36.2 \
140-
Pillow
121+
# Install Neuron Driver, Runtime and Tools
122+
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
123+
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
141124

142-
RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
125+
RUN apt-get update \
126+
&& apt-get install -y \
127+
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
128+
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
129+
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
130+
&& rm -rf /var/lib/apt/lists/* \
131+
&& rm -rf /tmp/tmp* \
132+
&& apt-get clean
133+
134+
# Add Neuron PATH
135+
ENV PATH="/opt/aws/neuron/bin:${PATH}"
136+
137+
# Install AWS CLI
138+
RUN ${PIP} install --no-cache-dir -U "awscli<2"
139+
140+
# Install JAX & Neuron CC
143141
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
144-
&& ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
145-
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
146-
147-
RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
148-
149-
## Installation for Neuronx Distributed Training framework
150-
# Install Cython
151-
RUN pip install --no-cache-dir Cython
152-
153-
# Copy the apex_setup.py file
154-
COPY apex_setup.py /root/apex_setup.py
155-
156-
# Clone and build Apex
157-
RUN git clone https://github.com/NVIDIA/apex.git /root/apex \
158-
&& cd /root/apex \
159-
&& git checkout 23.05 \
160-
&& cp /root/apex_setup.py setup.py \
161-
&& python3 setup.py bdist_wheel
162-
163-
#Install dependencies from requirements and extras for SageMaker usecase
164-
RUN wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/master/requirements.txt \
165-
&& pip install --no-cache-dir -r requirements.txt /root/apex/dist/apex-0.1-py3-none-any.whl \
166-
&& pip install --force-reinstall "multiprocess==0.70.16" \
167-
"dill==0.3.8" \
168-
"torch==1.13.1"
169-
170-
171-
RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
172-
173-
# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
174-
# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
175-
# awscli 1.25.47 has requirement docutils<0.17,>=0.10
176-
# etcd for kubernetes installation
177-
# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
178-
# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
179-
RUN ${PIP} install --no-cache-dir -U \
180-
"attrs<24,>=23.1.0" \
181-
"protobuf>=3.18.3,<=3.20.3" \
182-
"docutils>=0.10,<0.17" \
183-
"rsa<4.8,>=3.1.2" \
184-
"python-etcd" \
185-
"urllib3>=1.26.0,<1.27"
186-
187-
# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
188-
RUN pip install --no-cache-dir -U \
189-
"bokeh>=3.0.1,<4" \
190-
"imageio>=2.22,<3" \
191-
"opencv-python>=4.8.1.78" \
192-
"plotly>=5.11,<6" \
193-
"seaborn>=0.12,<1" \
194-
"shap>=0.41,<1"
142+
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
143+
&& ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
195144

196145
# The EFA installer runs apt-get itself, so refresh the package index with apt-get update before invoking it
197146
RUN apt-get update
@@ -205,27 +154,14 @@ RUN cd $HOME \
205154
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
206155
&& cd $HOME
207156

208-
209157
# Clean up after apt update
210158
RUN rm -rf /var/lib/apt/lists/* \
211159
&& rm -rf /tmp/tmp* \
212160
&& apt-get clean
213161

214-
# Install some common packages used by training scripts
215-
# torchvision needed for MLP. since it depends on torch and torch neuron/torch
216-
# is already installed install it with nodeps
217-
RUN pip3 install --no-cache-dir --no-deps -U \
218-
torchvision==0.14.*
219-
220-
# Needed for running bert training scripts
221-
RUN pip3 install --no-cache-dir -U \
222-
graphviz \
223-
tensorboard==2.6 \
224-
accelerate \
225-
sentencepiece!=0.1.92 \
226-
h5py \
227-
requests
228-
162+
# Copy workaround script for incorrect hostname
163+
COPY changehostname.c /
164+
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
229165
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
230166

231167
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
@@ -241,8 +177,8 @@ RUN HOME_DIR=/root \
241177
&& rm -rf ${HOME_DIR}/oss_compliance* \
242178
&& rm -rf /tmp/tmp*
243179

244-
RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt
245-
246180
# Starts framework
247181
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
248182
CMD ["/bin/bash"]
183+
184+
HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"CVE-2024-35195": {
3+
"description": "Requests is a HTTP library. Prior to 2.32.0, when making requests through a Requests `Session`, if the first request is made with `verify=False` to disable cert verification, all subsequent requests to the same host will continue to ignore cert verification regardless of changes to the value of `verify`. This behavior will continue for the lifecycle of the connection in the connection pool. This vulnerability is fixed in 2.32.0.",
4+
"remediation": {
5+
"recommendation": {
6+
"text": "None Provided"
7+
}
8+
},
9+
"score": 0.0,
10+
"score_details": {},
11+
"severity": "UNTRIAGED",
12+
"source": "NVD",
13+
"source_url": "https://nvd.nist.gov/vuln/detail/CVE-2024-35195",
14+
"status": "ACTIVE",
15+
"title": "CVE-2024-35195 - requests",
16+
"vulnerability_id": "CVE-2024-35195",
17+
"vulnerable_packages": [
18+
{
19+
"epoch": 0,
20+
"filePath": "usr/local/lib/python3.10/site-packages/requests-2.31.0.dist-info/METADATA",
21+
"name": "requests",
22+
"packageManager": "PYTHONPKG",
23+
"version": "2.31.0"
24+
}
25+
]
26+
}
27+
}

0 commit comments

Comments
 (0)