Skip to content

Commit 6dcc0f7

Browse files
committed
fix #639 provide NCCL tests example
Signed-off-by: Sam Stoelinga <sammiestoel@gmail.com>
1 parent 52cda2c commit 6dcc0f7

File tree

2 files changed

+128
-0
lines changed

2 files changed

+128
-0
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Full CUDA toolkit release used to pick the base image tag. Despite the
# "MINOR" suffix, the default carries a patch component as well.
ARG CUDA_VERSION_MINOR=12.4.1
# Base image; override to build on a different CUDA / Ubuntu combination.
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-devel-ubuntu22.04
FROM ${BASE_IMAGE} AS base

# CUDA "major.minor" string; it forms the "+cuda<version>" suffix of the NCCL
# apt package versions installed below.
ARG CUDA_VERSION_MAJOR=12.4
# NCCL release pinned for the libnccl2 / libnccl-dev packages.
ARG TARGET_NCCL_VERSION=2.21.5-1

# Declared as ARG (not ENV) so it only affects the build, not running containers.
ARG DEBIAN_FRONTEND=noninteractive
9+
# Build toolchain, Open MPI, the pinned NCCL runtime/development packages and
# assorted networking / debugging utilities. Packages are listed one per line
# in alphabetical order to keep diffs small.
# NOTE(review): --allow-change-held-packages and --allow-downgrades are
# presumably needed because the base image already ships NCCL packages that
# must be replaced by the pinned version -- confirm.
RUN apt-get -qq update \
 && apt-get -qq install -y \
      --allow-change-held-packages \
      --allow-downgrades \
      --no-install-recommends \
      autoconf \
      automake \
      autotools-dev \
      build-essential \
      ca-certificates \
      curl \
      datacenter-gpu-manager \
      environment-modules \
      g++ \
      git \
      iputils-ping \
      libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
      libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
      libnuma1 \
      libopenmpi-dev \
      libpci-dev \
      libpmix-dev \
      libsubunit0 \
      libtool \
      net-tools \
      openmpi-bin \
      openssh-server \
      unzip \
      vim \
      wget \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
27+
28+
# InfiniBand / RDMA user-space libraries and diagnostic tools.
# Each package is listed exactly once (ibverbs-utils used to appear twice).
RUN apt-get -qq update \
 && apt-get -qq install -y --no-install-recommends \
      ibverbs-utils \
      infiniband-diags \
      libibumad-dev \
      libibumad3 \
      libibverbs-dev \
      librdmacm-dev \
      rdmacm-utils \
 && rm -rf /var/lib/apt/lists/*
32+
33+
# Google Cloud SDK.
# The repository signing key is written to a dedicated keyring and referenced
# through "signed-by", replacing the deprecated `apt-key add`, so the key is
# trusted for this repository only.
# The key is downloaded to a file with `curl -f` rather than piped, so an HTTP
# error aborts the build at that step even though the default shell has no
# pipefail.
# The source list is written with ">" instead of appended, which keeps the
# step idempotent.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      apt-transport-https \
      ca-certificates \
      curl \
      gnupg \
 && curl -fsSL -o /tmp/google-cloud-apt-key.gpg \
      https://packages.cloud.google.com/apt/doc/apt-key.gpg \
 && gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg /tmp/google-cloud-apt-key.gpg \
 && rm -f /tmp/google-cloud-apt-key.gpg \
 && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
      > /etc/apt/sources.list.d/google-cloud-sdk.list \
 && apt-get update \
 && apt-get install -y google-cloud-sdk \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
38+
39+
40+
41+
# NCCL Tests
# Pinned nccl-tests revision plus the nvcc -gencode flags handed to the build
# (compute_90 targets: one sm_90 entry and one compute_90 entry).
ENV NCCL_TESTS_COMMITISH=c6afef0 \
    CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90' \
    CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
WORKDIR /opt/nccl-tests
# Unpack the pinned source archive into the working directory, print the MPI
# compiler wrapper's underlying command line for the build log, then build with
# MPI support using mpic++ as the C++ compiler. The trailing symlink makes the
# tree reachable under the underscore spelling /opt/nccl_tests as well.
RUN wget -q -O - "https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz" \
      | tar --strip-components=1 -xzf - \
 && mpicc -show \
 && CXX=mpic++ make -j20 MPI=1 MPI_HOME=/usr/include/openmpi NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" \
 && ln -s /opt/nccl-tests /opt/nccl_tests

# Refresh the dynamic linker cache.
RUN ldconfig
53+
54+
# SSH dependencies for MPI
# Client (/etc/ssh/ssh_config): set StrictHostKeyChecking to "no", rewrite the
# Port setting to 2222 and send known-hosts entries to /dev/null.
# Server (/etc/ssh/sshd_config): uncomment StrictModes as "no" and Port as 2222.
# Finally make sure /var/run/sshd exists.
RUN sed -i \
      -e 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' \
      -e 's/[ #]\(.*Port \).*/ \12222/g' \
      /etc/ssh/ssh_config \
 && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
 && sed -i \
      -e 's/#\(StrictModes \).*/\1no/g' \
      -e 's/#\(Port \).*/\12222/g' \
      /etc/ssh/sshd_config \
 && mkdir -p /var/run/sshd
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# MPIJob that runs the NCCL all_reduce_perf benchmark: one launcher pod drives
# mpirun across two worker pods with 8 GPUs each.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: nccl-tests
spec:
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
    activeDeadlineSeconds: 666
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - image: mpioperator/nccl-tests:latest
            name: nccl
            securityContext:
              privileged: true
            env:
            - name: OMPI_ALLOW_RUN_AS_ROOT
              value: "1"
            - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
              value: "1"
            - name: OMPI_MCA_orte_base_help_aggregate
              value: "0"
            # Total MPI ranks = Worker replicas (2) x slotsPerWorker (8).
            # The launch script reads ${NP}; without this definition the
            # variable expanded to nothing and the benchmark mpirun failed.
            # Keep it in sync when changing replicas or slotsPerWorker.
            - name: NP
              value: "16"
            command: ["/bin/bash", "-c"]
            args:
              - |
                set -xe
                export NCCL_DEBUG=INFO
                # Retry until every rank can run nvidia-smi through mpirun.
                until mpirun -np ${NP} -x LD_LIBRARY_PATH -bind-to none /usr/local/nvidia/bin/nvidia-smi; do sleep 5; done
                mpirun -np ${NP} -bind-to none \
                  -x NCCL_DEBUG \
                  /opt/nccl_tests/build/all_reduce_perf -c 0 -b 8 -e 16G \
                  -f 4 -g 1 -n 10
            resources:
              requests:
                cpu: 50m
                memory: 128Mi
          enableServiceLinks: false
          automountServiceAccountToken: false
    Worker:
      replicas: 2
      template:
        metadata:
          annotations:
        spec:
          volumes:
          # Memory-backed emptyDir mounted over /dev/shm in the worker container.
          - name: shared-memory
            emptyDir:
              medium: "Memory"

          containers:
          - image: mpioperator/nccl-tests:latest
            name: nccl
            securityContext:
              privileged: true
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: shared-memory
              mountPath: /dev/shm

          enableServiceLinks: false
          automountServiceAccountToken: false

0 commit comments

Comments
 (0)