|
# Base image selection. Both values can be overridden with --build-arg;
# BASE_IMAGE defaults to the CUDA devel image for CUDA_VERSION_MINOR.
ARG CUDA_VERSION_MINOR=12.4.1
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-devel-ubuntu22.04
FROM ${BASE_IMAGE} AS base

# Stage-scoped build arguments (ARGs declared before FROM are only visible to
# FROM lines). These two are combined into the apt version string that pins
# the NCCL packages: <TARGET_NCCL_VERSION>+cuda<CUDA_VERSION_MAJOR>.
ARG CUDA_VERSION_MAJOR=12.4
ARG TARGET_NCCL_VERSION=2.21.5-1

# Build-time only: keeps apt non-interactive without persisting the setting
# into the runtime environment of the image.
ARG DEBIAN_FRONTEND=noninteractive
# Core toolchain, MPI, diagnostics and the pinned NCCL packages.
# libnccl2 / libnccl-dev are requested at the exact version
# ${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR}; --allow-downgrades and
# --allow-change-held-packages permit apt to swap out an already-installed or
# held NCCL build (presumably shipped by the base image -- confirm).
RUN apt-get -qq update \
 && apt-get -qq install -y \
      --allow-change-held-packages \
      --allow-downgrades \
      --no-install-recommends \
      autoconf \
      automake \
      autotools-dev \
      build-essential \
      ca-certificates \
      curl \
      datacenter-gpu-manager \
      environment-modules \
      g++ \
      git \
      iputils-ping \
      libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
      libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
      libnuma1 \
      libopenmpi-dev \
      libpci-dev \
      libpmix-dev \
      libsubunit0 \
      libtool \
      net-tools \
      openmpi-bin \
      openssh-server \
      unzip \
      vim \
      wget \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# InfiniBand / RDMA userspace libraries and diagnostic utilities.
# Each package is listed exactly once, one per line and sorted, so additions
# and removals show up as single-line diffs.
RUN apt-get -qq update \
 && apt-get -qq install -y --no-install-recommends \
      ibverbs-utils \
      infiniband-diags \
      libibumad-dev \
      libibumad3 \
      libibverbs-dev \
      librdmacm-dev \
      rdmacm-utils \
 && rm -rf /var/lib/apt/lists/*

# Google Cloud SDK from Google's apt repository.
# The repository key is de-armored into a dedicated keyring and referenced via
# signed-by, replacing the deprecated `apt-key add` (which trusts the key for
# every configured repository). The key is downloaded to a file with `curl -f`
# instead of being piped, so an HTTP error aborts the build rather than being
# masked by the exit status of the pipeline's last command.
RUN apt-get update \
 && apt-get install -y apt-transport-https ca-certificates gnupg curl \
 && curl -fsSL -o /tmp/google-cloud-apt-key.gpg \
      https://packages.cloud.google.com/apt/doc/apt-key.gpg \
 && gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg /tmp/google-cloud-apt-key.gpg \
 && rm -f /tmp/google-cloud-apt-key.gpg \
 && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
      > /etc/apt/sources.list.d/google-cloud-sdk.list \
 && apt-get update \
 && apt-get install -y google-cloud-sdk \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# NCCL tests: fetch and build NVIDIA's nccl-tests with MPI support.
# NCCL_TESTS_COMMITISH picks the upstream archive to download; the two gencode
# values are passed to the build as NVCC_GENCODE and target compute_90
# (code=sm_90 plus code=compute_90).
ENV NCCL_TESTS_COMMITISH=c6afef0 \
    CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90' \
    CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
WORKDIR /opt/nccl-tests
# Steps: unpack the archive into the working directory (dropping its top-level
# folder), print the MPI compiler wrapper's underlying command line into the
# build log, build with mpic++ as CXX, then expose the directory under the
# underscore-spelled alias /opt/nccl_tests.
RUN wget -q -O - https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz \
      | tar --strip-components=1 -xzf - \
 && mpicc -show \
 && CXX=mpic++ make -j20 MPI=1 MPI_HOME=/usr/include/openmpi \
      NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" \
 && ln -s /opt/nccl-tests /opt/nccl_tests

# Refresh the dynamic linker cache.
RUN ldconfig

# SSH configuration used by MPI.
# Client (/etc/ssh/ssh_config): rewrite the StrictHostKeyChecking entry to
#   "no", rewrite the Port entry to 2222, and append a UserKnownHostsFile
#   pointing at /dev/null.
# Server (/etc/ssh/sshd_config): rewrite the commented StrictModes entry to
#   "no" and the commented Port entry to 2222.
# /var/run/sshd is created at the end (presumably sshd's runtime directory --
# confirm).
RUN sed -i \
      -e 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' \
      -e 's/[ #]\(.*Port \).*/ \12222/g' \
      /etc/ssh/ssh_config \
 && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
 && sed -i \
      -e 's/#\(StrictModes \).*/\1no/g' \
      -e 's/#\(Port \).*/\12222/g' \
      /etc/ssh/sshd_config \
 && mkdir -p /var/run/sshd
0 commit comments