Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,18 @@ jobs:
fail-fast: true
matrix:
include:
- name: cuda13.0-arm64
dockerfile: cuda13.0
tags: superbench/main:cuda13.0-arm64
platforms: linux/arm64
runner: [self-hosted, linux/arm64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda13.0-amd64
dockerfile: cuda13.0
tags: superbench/main:cuda13.0-amd64
platforms: linux/amd64
runner: [self-hosted, linux/amd64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.8-arm64
dockerfile: cuda12.8
tags: superbench/main:cuda12.8-arm64
Expand Down
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
url = https://github.com/netbench/GPCNET.git
[submodule "third_party/gpu-burn"]
path = third_party/gpu-burn
url = https://github.com/wilicc/gpu-burn.git
url = https://github.com/WenqingLan1/gpu-burn.git
[submodule "third_party/msccl"]
path = third_party/msccl
url = https://github.com/Azure/msccl
Expand Down
161 changes: 161 additions & 0 deletions dockerfile/cuda13.0.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Base image: the PyTorch container published on nvcr.io, tag 25.08-py3.
FROM nvcr.io/nvidia/pytorch:25.08-py3

# Component versions. Entries marked "installed in this dockerfile" are pinned by
# ENV variables or download URLs further down; the remaining entries describe
# what the base image is expected to ship and are not checked by this file.
# OS:
#   - Ubuntu: 24.04
#   - OpenMPI: 4.1.9a1
#   - Docker Client: 20.10.8 (installed in this dockerfile, see DOCKER_VERSION)
# NVIDIA:
#   - CUDA: 13.0.0.044
#   - cuDNN: 9.12.0.46
#   - cuBLAS: 13.0.0.19
#   - NCCL: 2.27.7
#   - TransformerEngine: v2.5
#   - torch: 2.8.0a0+34c6371d24
# Mellanox:
#   - MOFED_VERSION: 24.10-1.1.4.0 (installed in this dockerfile, see OFED_VERSION)
#   - HPC-X: v2.24.1 (installed in this dockerfile, see HPCX_VERSION)
#   - UCX: 1.18.0 (built from source in this dockerfile, see UCX_VERSION)
# Intel:
#   - mlc: v3.11 (installed in this dockerfile, amd64 only)

LABEL maintainer="SuperBench"

# Keeps apt/dpkg from prompting during the package installs below.
# NOTE(review): because this is ENV rather than ARG, the value also persists
# into the environment of containers started from the image.
ENV DEBIAN_FRONTEND=noninteractive

# Install OS packages needed to build and run the benchmarks.
# The package list is kept sorted alphabetically so additions diff cleanly.
# `autoremove` is given -y: without it apt-get asks for confirmation when it
# finds something to remove and aborts because a build has no interactive stdin.
# The apt lists and /tmp are wiped in the same layer so they never reach the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        bc \
        build-essential \
        curl \
        dmidecode \
        ffmpeg \
        git \
        iproute2 \
        jq \
        libaio-dev \
        libavcodec-dev \
        libavformat-dev \
        libavutil-dev \
        libboost-program-options-dev \
        libcap2 \
        libcurl4-openssl-dev \
        libncurses-dev \
        libnuma-dev \
        libpci-dev \
        libswresample-dev \
        libtool \
        lshw \
        net-tools \
        nlohmann-json3-dev \
        openssh-client \
        openssh-server \
        pciutils \
        python3-mpi4py \
        rsync \
        sudo \
        util-linux \
        vim \
        wget \
    && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

# Number of parallel make jobs for source builds (consumed by the UCX build
# below). Empty unless supplied with --build-arg NUM_MAKE_JOBS=<n>.
ARG NUM_MAKE_JOBS=
# Platform build args. TARGETARCH gates the amd64-only installs below;
# TARGETPLATFORM is declared but not referenced directly by any instruction
# in this file.
ARG TARGETPLATFORM
ARG TARGETARCH

# Install Docker: fetch the static binary bundle for the machine architecture
# reported by `uname -m` and unpack its contents straight into /usr/local/bin.
ENV DOCKER_VERSION=20.10.8
RUN HW_ARCH="$(uname -m)" && \
    wget -q -O docker.tgz "https://download.docker.com/linux/static/stable/${HW_ARCH}/docker-${DOCKER_VERSION}.tgz" && \
    tar -x -f docker.tgz --strip-components=1 -C /usr/local/bin/ && \
    rm docker.tgz

# Update system config:
#   - create root's SSH directory, an empty authorized_keys and the sshd run dir
#   - sshd: permit root login, permit user environment files, listen on port 22
#   - append soft/hard open-file limits of 1048576 for all users and for root
# The limits are written with printf rather than echo "...\n...": whether echo
# expands "\n" depends on which shell backs /bin/sh, while printf always emits
# one entry per line.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' \
        '* soft nofile 1048576' \
        '* hard nofile 1048576' \
        'root soft nofile 1048576' \
        'root hard nofile 1048576' >> /etc/security/limits.conf

# Install OFED: download the MLNX_OFED bundle for Ubuntu 24.04 and the machine
# architecture, run its installer with the flags below, then delete the bundle
# in the same layer.
ENV OFED_VERSION=24.10-1.1.4.0
RUN HW_ARCH="$(uname -m)" && \
    OFED_BUNDLE="MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${HW_ARCH}" && \
    cd /tmp && \
    wget -q "https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/${OFED_BUNDLE}.tgz" && \
    tar xzf "${OFED_BUNDLE}.tgz" && \
    "${OFED_BUNDLE}/mlnxofedinstall" --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

# Install HPC-X: replace any existing /opt/hpcx with the CUDA 13 build of the
# pinned HPC-X release for Ubuntu 24.04 and the machine architecture.
ENV HPCX_VERSION=v2.24.1
RUN HW_ARCH="$(uname -m)" && \
    HPCX_BUNDLE="hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda13-${HW_ARCH}" && \
    cd /opt && \
    rm -rf hpcx && \
    wget -O hpcx.tbz "https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}_cuda13/${HPCX_BUNDLE}.tbz" && \
    tar xf hpcx.tbz && \
    mv "${HPCX_BUNDLE}" hpcx && \
    rm hpcx.tbz

# Installs specific to amd64 platform: Intel MLC, the AOCC compiler and AMD BLIS.
# On any other TARGETARCH the step only prints a notice.
# The AOCC .deb is installed with apt-get rather than apt: apt is the
# interactive front end and warns that its CLI is not stable for scripts.
RUN if [ "$TARGETARCH" = "amd64" ]; then \
        # Install Intel MLC
        cd /tmp && \
        wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
        tar xzf mlc.tgz Linux/mlc && \
        cp ./Linux/mlc /usr/local/bin/ && \
        rm -rf ./Linux mlc.tgz && \
        # Install AOCC compiler
        wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
        apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
        rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
        # Install AMD BLIS
        wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
        tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
        mv amd-blis /opt/AMD && \
        rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
    else \
        echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
    fi

# Install UCX with multi-threading support: build the pinned release from
# source with the configure-release-mt helper and install it under /usr/local.
# - Parallelism comes from NUM_MAKE_JOBS; when that is empty it falls back to
#   the number of available CPUs instead of an unbounded `make -j`.
# - The tarball and build tree are removed in the same layer so they do not
#   end up in the image.
# NOTE(review): the tarball is fetched from the v${UCX_VERSION}-rc1 release
# tag; confirm this is intended rather than the final v${UCX_VERSION} release.
ENV UCX_VERSION=1.18.0
RUN cd /tmp && \
    wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \
    tar xzf ucx-${UCX_VERSION}.tar.gz && \
    cd ucx-${UCX_VERSION} && \
    ./contrib/configure-release-mt --prefix=/usr/local && \
    make -j "${NUM_MAKE_JOBS:-$(nproc)}" && \
    make install && \
    cd /tmp && \
    rm -rf ucx-${UCX_VERSION} ucx-${UCX_VERSION}.tar.gz

# Runtime environment: put /usr/local/lib and the MPI libraries first on the
# library path, set the SuperBench locations and the Ansible settings.
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench \
    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Rewrite /etc/environment with the three variables below, and append the
# HPC-X init line to both /etc/bash.bashrc and /etc/profile.d/10-hpcx.sh.
RUN { \
        echo PATH="$PATH"; \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH"; \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment && \
    echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc >> /etc/profile.d/10-hpcx.sh

# Add config files
# COPY is used instead of ADD throughout: every source here is a plain local
# directory, so ADD's archive-extraction and URL handling are not needed.
COPY dockerfile/etc /opt/microsoft/

WORKDIR ${SB_HOME}

# Third-party sources are copied and built before the rest of the tree so this
# layer stays cached when only files outside third_party change.
COPY third_party third_party
RUN make -C third_party cuda

# Copy the remaining sources, install the package with the nvworker extra,
# run the cppbuild and postinstall make targets, and drop the .git directory
# that came in with the build context. --no-cache-dir on both pip calls keeps
# pip's download cache out of the layer.
COPY . .
RUN python3 -m pip install --no-cache-dir --upgrade setuptools==78.1.0 && \
    python3 -m pip install --no-cache-dir .[nvworker] && \
    make cppbuild && \
    make postinstall && \
    rm -rf .git
7 changes: 7 additions & 0 deletions superbench/benchmarks/micro_benchmarks/cuda_common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,11 @@ if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
list(APPEND NVCC_ARCHS_SUPPORTED 100)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.9)
list(APPEND NVCC_ARCHS_SUPPORTED 103)
endif()
# CUDA 13.0+ drops support for archs before sm_75, remove them
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
list(REMOVE_ITEM NVCC_ARCHS_SUPPORTED 53 60 61 70 72)
endif()
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,12 @@ void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu,
OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUcontext &cuContext, bool bSingle,
bool bHost, cudaVideoCodec codec, CUVIDDECODECAPS decodecaps) {
if (!bSingle) {
#if CUDA_VERSION >= 13000
CUctxCreateParams ctxCreateParams = {};
ck(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice));
#else
ck(cuCtxCreate(&cuContext, 0, cuDevice));
#endif
}
OptimizedNvDecoder *sessionObject = new OptimizedNvDecoder(cuContext, !bHost, codec, decodecaps);
sessionObject->setDecoderSessionID(i);
Expand Down Expand Up @@ -247,7 +252,12 @@ void InitializeContext(std::vector<OptimizedNvDecoder *> &vDec, int iGpu, int nT
std::cout << "GPU in use: " << szDeviceName << std::endl;

CUcontext cuContext = NULL;
#if CUDA_VERSION >= 13000
CUctxCreateParams ctxCreateParams = {};
ck(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice));
#else
ck(cuCtxCreate(&cuContext, 0, cuDevice));
#endif

CUVIDDECODECAPS decodecaps;
GetDefaultDecoderCaps(decodecaps, codec);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(self, name, parameters=''):
# Skip INT4 for Hopper due to no native CUDA/Tensor Cores
self.__kernel_map[9.0] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__kernel_map[10.0] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__kernel_map[10.3] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__parse_logline = [
'gemm,cutlass_simt_dgemm_128x128_8x2', 'gemm,cutlass_simt_sgemm_128x128_8x2',
'gemm,cutlass_simt_hgemm_256x128_8x2', 'gemm,cutlass_tensorop_d884gemm_128x128_16x3',
Expand Down
Loading
Loading