Skip to content

Updated Dockerfiles #101

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions docker/common/apex_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,32 @@
import subprocess

import torch
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME, load
from torch.utils.cpp_extension import (
BuildExtension,
CppExtension,
CUDAExtension,
CUDA_HOME,
load,
)

setup(
name="apex",
version="0.1",
packages=find_packages(
exclude=("build", "csrc", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info",)
exclude=(
"build",
"csrc",
"include",
"tests",
"dist",
"docs",
"tests",
"examples",
"apex.egg-info",
)
),
install_requires=["packaging>20.6",],
install_requires=[
"packaging>20.6",
],
description="PyTorch Extensions written by NVIDIA",
)
)
18 changes: 18 additions & 0 deletions docker/common/nxdt_install_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Install megatron-core (pinned release) and patch the installed nemo_toolkit
# so it works without Transformer Engine and with S3 checkpoint paths.
# Fail fast: abort on any error, on unset variables, and propagate pipeline failures.
set -o pipefail
set -e
set -u

# Install megatron-core from the pinned release branch.
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout core_r0.10.0
pip install .
# Build the C/C++ dataset helpers shipped with megatron-core.
cd megatron/core/datasets
make

# Locate the site-packages directory containing the installed nemo_toolkit.
VENV_PATH=$(pip show nemo_toolkit | grep Location | awk '{print $2}')
# Remove call to get_megatron_pretrained_bert_models() as it uses Transformer Engine which we don't support
sed -i 's/get_megatron_pretrained_bert_models()/[]/g' "${VENV_PATH}/nemo/collections/nlp/models/nlp_model.py"
# Remove filepath checking as it could be an S3 path when S3 checkpointing
sed -i 's/ and self\._fs\.exists(ckpt_to_dir(filepath))//g' "${VENV_PATH}/nemo/utils/callbacks/nemo_model_checkpoint.py"
47 changes: 47 additions & 0 deletions docker/common/nxdt_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
hydra-core>=1.2.0,<1.3
omegaconf>=2.2,<2.3
pyyaml==6.0.1
torchmetrics>=0.4.1rc0,<=0.10.3
trl==0.10.1
transformers==4.48.0
wandb
webdataset>=0.1.48,<=0.1.62
pandas
sentencepiece
youtokentome
h5py
ijson
matplotlib>=3.3.2
sacremoses
sacrebleu
einops
faiss-cpu
sentence_transformers>=2.3
nltk>=3.6.5
jieba
ftfy
gdown
inflect
opencc==1.1.6
pangu
rapidfuzz
pybind11
pytorch-lightning==2.5.0
ipadic
mecab-python3
protobuf==3.20.3
datasets==2.19.1
dill==0.3.8
nemo_toolkit==2.1.0
regex
requests<2.32.0
python-daemon
huggingface_hub>=0.27.1
multiprocess==0.70.16
numba<=0.60.0
numpy>=1.24.3,<=1.25.2
rouge_score
setuptools>=70.0
lightning==2.5.0
ml-dtypes==0.2.0
210 changes: 151 additions & 59 deletions docker/jax/training/0.5/Dockerfile.neuronx
Original file line number Diff line number Diff line change
@@ -1,23 +1,17 @@
FROM public.ecr.aws/docker/library/ubuntu:22.04
ARG BUILD_STAGE=prod

FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base

LABEL dlc_major_version="1"

Check failure on line 5 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3048 style: Invalid label key.
LABEL maintainer="Amazon AI"

# Neuron SDK components version numbers
ARG NEURONX_RUNTIME_LIB_VERSION=2.24.53.0-f239092cc
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.24.59.0-838c7fc8b
ARG NEURONX_TOOLS_VERSION=2.22.61.0
ARG NEURONX_CC_VERSION=2.17.194.0
ARG NEURONX_JAX_TRAINING_VERSION=0.1.3

# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
ARG DEBIAN_FRONTEND=noninteractive
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3
ARG OMPI_VERSION=4.1.5

# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
ARG DEBIAN_FRONTEND=noninteractive

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
Expand All @@ -30,8 +24,9 @@
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
ENV PATH="/opt/aws/neuron/bin:${PATH}"

RUN apt-get update \

Check failure on line 29 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3008 warning: Pin versions in apt get install. Instead of `apt-get install <package>` use `apt-get install <package>=<version>`
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
build-essential \
Expand Down Expand Up @@ -74,7 +69,7 @@
&& apt-get clean

# Install Open MPI
RUN mkdir -p /tmp/openmpi \

Check failure on line 72 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

SC2046 warning: Quote this to prevent word splitting.

Check failure on line 72 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3003 warning: Use WORKDIR to switch to a directory
&& cd /tmp/openmpi \
&& wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
&& tar zxf openmpi-${OMPI_VERSION}.tar.gz \
Expand All @@ -86,16 +81,18 @@
&& rm -rf /tmp/openmpi

# Install packages and configure SSH for MPI operator in k8s
RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
RUN apt-get update \

Check failure on line 84 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3008 warning: Pin versions in apt get install. Instead of `apt-get install <package>` use `apt-get install <package>=<version>`

Check failure on line 84 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3015 info: Avoid additional packages by specifying `--no-install-recommends`
&& apt-get install -y openmpi-bin openssh-server \
&& mkdir -p /var/run/sshd \
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# install Python
# Install Python
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \

Check failure on line 95 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

SC2046 warning: Quote this to prevent word splitting.

Check failure on line 95 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3003 warning: Use WORKDIR to switch to a directory
&& tar -xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-shared --prefix=/usr/local \
Expand All @@ -104,8 +101,26 @@
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
"awscli<2" \
pip \
setuptools
requests \
setuptools \
&& rm -rf ~/.cache/pip/*

# Install EFA
RUN apt-get update \

Check failure on line 111 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3003 warning: Use WORKDIR to switch to a directory

Check failure on line 111 in docker/jax/training/0.5/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3047 info: Avoid use of wget without progress bar. Use `wget --progress=dot:giga <url>`. Or consider using `-q` or `-nv` (shorthands for `--quiet` or `--no-verbose`).
&& cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
&& cat aws-efa-installer.key | gpg --fingerprint \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
&& tar -xf aws-efa-installer-latest.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
&& cd $HOME \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

WORKDIR /

Expand All @@ -118,64 +133,141 @@

RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

# Install Neuron Driver, Runtime and Tools
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/

RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance* \
&& rm -rf /tmp/tmp*

# Setting up APT and PIP repo for neuron artifacts
ARG NEURON_APT_REPO=https://apt.repos.neuron.amazonaws.com
ARG NEURON_APT_REPO_KEY
ARG NEURON_PIP_REPO=https://pip.repos.neuron.amazonaws.com
ARG NEURON_PIP_REPO_KEY
RUN mkdir -p /etc/apt/keyrings \
&& APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
&& echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} focal main" > /etc/apt/sources.list.d/neuron.list \
&& curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
&& PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
&& ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"

# Neuron SDK components version numbers
ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
ARG IGNORE_MISSING_NEURON_COMPONENTS=false
RUN IGNORE_MISSING_NEURON_COMPONENTS=$(echo ${IGNORE_MISSING_NEURON_COMPONENTS} | tr '[:upper:]' '[:lower:]')

ARG NEURONX_RUNTIME_LIB_VERSION=2.25.57.0-166c7a468
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.25.65.0-9858ac9a1
ARG NEURONX_TOOLS_VERSION=2.23.9.0

ARG NEURONX_CC_VERSION=2.18.121.0
ARG NEURONX_JAX_TRAINING_VERSION=0.5.3.1.0.719+1d9c17be

FROM base AS dev

# Install the pinned Neuron apt components (collectives, runtime-lib, tools).
# A bind mount exposes locally staged .deb artifacts under ${NEURON_ARTIFACT_PATH}/apt;
# for each package, if a file named after the requested version exists there it is
# installed from that file, otherwise the package is installed as pkg=version from
# the configured apt repo — unless IGNORE_MISSING_NEURON_COMPONENTS is not "false",
# in which case the package is skipped with a log line.
# NOTE(review): this compares the ARG value directly; the lowercasing RUN earlier
# in the file does not persist across RUN layers — confirm the intended casing.
RUN --mount=type=bind,source=apt,target=${NEURON_ARTIFACT_PATH}/apt \
install_apt_package() { \
pkg_name=$1; \
version_arg=$2; \
if [ -f "${NEURON_ARTIFACT_PATH}/apt/${version_arg}" ]; then \
apt-get install -y ${NEURON_ARTIFACT_PATH}/apt/${version_arg}; \
elif [ "${IGNORE_MISSING_NEURON_COMPONENTS}" = "false" ]; then \
apt-get install -y ${pkg_name}=${version_arg}; \
else \
echo "Ignoring package ${pkg_name}"; \
fi; \
} \
&& apt-get update \
&& install_apt_package "aws-neuronx-collectives" "${NEURONX_COLLECTIVES_LIB_VERSION}" \
&& install_apt_package "aws-neuronx-runtime-lib" "${NEURONX_RUNTIME_LIB_VERSION}" \
&& install_apt_package "aws-neuronx-tools" "${NEURONX_TOOLS_VERSION}" \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Add Neuron PATH
ENV PATH="/opt/aws/neuron/bin:${PATH}"
# Install the pinned Neuron pip packages (neuronx-cc, jax-neuronx).
# Each argument is a "name:version:flags" triple. A bind mount exposes locally
# staged wheels under ${NEURON_ARTIFACT_PATH}/pip; a wheel file named after the
# requested version is preferred, otherwise name==version is resolved from the
# pip index — unless IGNORE_MISSING_NEURON_COMPONENTS is not "false", in which
# case the package is skipped. Extra flags from all triples are deduplicated
# and passed to a single pip install invocation at the end.
RUN --mount=type=bind,source=pip,target=${NEURON_ARTIFACT_PATH}/pip \
install_pip_package() { \
packages=""; \
flags=""; \
while [ "$#" -gt 0 ]; do \
pkg_name=$(echo $1 | cut -d: -f1); \
version_arg=$(echo $1 | cut -d: -f2); \
extra_flags=$(echo $1 | cut -d: -f3); \
if [ -f "${NEURON_ARTIFACT_PATH}/pip/${version_arg}" ]; then \
packages="${packages} ${NEURON_ARTIFACT_PATH}/pip/${version_arg}"; \
else \
if [ "${IGNORE_MISSING_NEURON_COMPONENTS}" = "false" ]; then \
packages="${packages} ${pkg_name}==${version_arg}"; \
else \
echo "Ignoring package ${pkg_name}"; \
fi; \
fi; \
# Store unique flags
if [ ! -z "${extra_flags}" ]; then \
for flag in $(echo "${extra_flags}" | tr ' ' '\n'); do \
case " ${flags} " in \
*" ${flag} "*) ;; \
*) flags="${flags} ${flag}" ;; \
esac \
done; \
fi; \
shift; \
done; \
if [ ! -z "${packages}" ]; then \
echo "Installing packages: ${packages} with flags ${flags}"; \
${PIP} install --no-cache-dir --force-reinstall \
--extra-index-url="file:///${NEURON_ARTIFACT_PATH}/pip" \
${packages} ${flags}; \
fi; \
} \
&& install_pip_package "neuronx-cc:${NEURONX_CC_VERSION}:" "jax-neuronx:${NEURONX_JAX_TRAINING_VERSION}:" \
&& rm -rf ~/.cache/pip/*

# Install AWS CLI
RUN ${PIP} install --no-cache-dir -U "awscli<2"
FROM base AS repo

# Install JAX & Neuron CC
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
# Install Neuron components from the apt and pip repos
RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools \
aws-neuronx-collectives \
aws-neuronx-runtime-lib \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean \
&& ${PIP} install --no-cache-dir --force-reinstall \
neuronx-cc \
jax-neuronx \
&& rm -rf ~/.cache/pip/*

# EFA Installer does apt get. Make sure to run apt update before that
RUN apt-get update
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
&& cat aws-efa-installer.key | gpg --fingerprint \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
&& tar -xf aws-efa-installer-latest.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& cd $HOME

# Clean up after apt update
RUN rm -rf /var/lib/apt/lists/* \
FROM base AS prod

# Install Neuron components
# Install Neuron Driver, Runtime and Tools
RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
&& chmod +x /usr/local/bin/deep_learning_container.py
# Install JAX & Neuron CC
RUN ${PIP} install --no-cache-dir --force-reinstall \
neuronx-cc==$NEURONX_CC_VERSION \
jax-neuronx==$NEURONX_JAX_TRAINING_VERSION \
&& rm -rf ~/.cache/pip/*

# Fetch the OSS compliance tooling from S3, run it for this image's Python,
# then remove the downloaded artifacts so they do not persist in the layer.
RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance* \
&& rm -rf /tmp/tmp*
FROM ${BUILD_STAGE} AS final

# Starts framework
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
Expand Down
Loading