# syntax=docker/dockerfile:1.2
ARG MERLIN_VERSION=22.12
ARG TRITON_VERSION=22.11
ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}
FROM ${BASE_IMAGE} as base
ARG HUGECTR_VER=main
ARG HUGECTR_BACKEND_VER=main
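# Illustrative build invocation (image tag and argument values are placeholders,
# not defined by this file):
#   docker build -f dockerfile.ctr \
#     --build-arg MERLIN_VERSION=22.12 --build-arg TRITON_VERSION=22.11 \
#     --build-arg HUGECTR_VER=main --build-arg HUGECTR_BACKEND_VER=main \
#     -t merlin-hugectr:dev .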
# Envs
ENV CUDA_SHORT_VERSION=11.6
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_PATH=$CUDA_HOME
ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs
ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
ENV PATH=$PATH:/usr/lib/x86_64-linux-gnu/
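# Symlink the CUDA driver stub as libcuda.so.1 so build-time linking works on
# machines without an NVIDIA driver; the stub link is removed again at the end.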
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
RUN pip install --no-cache-dir --upgrade notebook ipython
RUN pip install --no-cache-dir mpi4py
# Install CUDA-aware hwloc. The hwloc headers bundled with HPC-X OpenMPI are
# removed first so the rebuilt, CUDA-enabled version is picked up instead.
ARG HWLOC_VER=2.4.1
RUN cd /opt/hpcx/ompi/include/openmpi/opal/mca/hwloc/hwloc201 && rm -rfv hwloc201.h hwloc/include/hwloc.h
RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://download.open-mpi.org/release/hwloc/v2.4/hwloc-${HWLOC_VER}.tar.gz && \
    tar -x -f /var/tmp/hwloc-${HWLOC_VER}.tar.gz -C /var/tmp && \
    cd /var/tmp/hwloc-${HWLOC_VER} && \
    ./configure CPPFLAGS="-I/usr/local/cuda/include/ -L/usr/local/cuda/lib64/" LDFLAGS="-L/usr/local/cuda/lib64" --enable-cuda && \
    make -j$(nproc) && make install && \
    rm -rf /var/tmp/hwloc-${HWLOC_VER} /var/tmp/hwloc-${HWLOC_VER}.tar.gz
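# Optional sanity check (illustrative): on a GPU host, `lstopo-no-graphics` from
# the rebuilt hwloc should list CUDA devices among the OS devices.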
# -----------------------------------------------------------------------------
# HugeCTR + Dependencies
# Optional dependency: build and install protocol buffers and Hadoop/HDFS
# support when INSTALL_HDFS=true.
ARG INSTALL_HDFS=false
# Arguments prefixed with "_" are only used when HUGECTR_DEV_MODE=false.
ARG HUGECTR_DEV_MODE=false
ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git"
ARG _CI_JOB_TOKEN=""
ENV OMPI_MCA_plm_rsh_agent=ssh
ENV OMPI_MCA_opal_cuda_support=true
ENV NCCL_LAUNCH_MODE=PARALLEL
ENV NCCL_COLLNET_ENABLE=0
ENV SHARP_COLL_NUM_COLL_GROUP_RESOURCE_ALLOC_THRESHOLD=0
ENV SHARP_COLL_LOCK_ON_COMM_INIT=1
ENV SHARP_COLL_LOG_LEVEL=3
ENV HCOLL_ENABLE_MCAST=0
# Create the split libcudf library names that the HugeCTR CMake build expects
# (libcudf is a single shared library in this image).
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
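# Point the unversioned libibverbs.so at the versioned library shipped in the
# image so the multi-node (RDMA) build can link against it.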
RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so
RUN rm -rf /usr/lib/x86_64-linux-gnu/libibverbs.so && \
ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1.14.36.0 /usr/lib/x86_64-linux-gnu/libibverbs.so
# Install HugeCTR
ARG HUGECTR_HOME=/usr/local/hugectr
RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
        git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
        cd /hugectr && \
        git submodule update --init --recursive && \
        mkdir build && \
        cd build && \
        LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
        export PATH=$PATH:/usr/local/cuda-${CUDA_SHORT_VERSION}/compat && \
        if [[ "${INSTALL_HDFS}" == "false" ]]; then \
            cmake -DCMAKE_BUILD_TYPE=Release -DSM="60;61;70;75;80" -DENABLE_MULTINODES=ON .. \
        ; else \
            cmake -DCMAKE_BUILD_TYPE=Release -DSM="60;61;70;75;80" -DENABLE_MULTINODES=ON -DENABLE_HDFS=ON .. \
        ; fi && \
        make -j$(nproc) && \
        make install && \
        rm -rf ./* && \
        chmod +x ${HUGECTR_HOME}/bin/* ${HUGECTR_HOME}/lib/*.so && \
        cd ../onnx_converter && \
        python setup.py install && \
        mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
    ; fi
ENV PATH=$PATH:${HUGECTR_HOME}/bin \
    CPATH=$CPATH:${HUGECTR_HOME}/include \
    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib \
    PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib
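# Quick check (illustrative): with the paths above in place, the Python bindings
# built earlier should be importable inside the image, e.g.
#   python -c "import hugectr"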
ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
ARG TRITON_VERSION
# Install Triton inference backend.
RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
        git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \
        mkdir /repos/hugectr_triton_backend/build && \
        cd /repos/hugectr_triton_backend/build && \
        cmake \
            -DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \
            -DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \
            -DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \
            -DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \
        make -j$(nproc) && \
        make install && \
        cd ../.. && \
        rm -rf hugectr_triton_backend && \
        chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hugectr/*.so && \
        rm -rf /repos \
    ; fi
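# Illustrative only: a model served with this backend selects it by name in its
# config.pbtxt, e.g.
#   backend: "hugectr"
# with model-specific parameters supplied as described in the hugectr_backend repo.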
RUN ln -s ${HUGECTR_HOME}/backends/hugectr /opt/tritonserver/backends/hugectr
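# Illustrative usage: with the backend linked under /opt/tritonserver/backends,
# the server can be started against a prepared model repository (path is a
# placeholder), e.g.
#   tritonserver --model-repository=/models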
# Remove the stub libcuda.so.1 symlink created earlier (only needed at build time)
RUN rm /usr/local/cuda/lib64/stubs/libcuda.so.1
# Clean up
RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked
RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch