Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
741ee98
add nvbench kernel launch
WenqingLan1 Jul 22, 2025
0ae7864
submodule update
WenqingLan1 Jul 22, 2025
35bfb61
init sleep kernel
WenqingLan1 Jul 30, 2025
66b4786
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Aug 25, 2025
82aed0c
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Sep 22, 2025
24ee0a5
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Oct 8, 2025
bd87f50
test sleep kernel
WenqingLan1 Oct 8, 2025
a663db6
add sm 103
WenqingLan1 Oct 8, 2025
32fe197
add arg parsing logic
WenqingLan1 Oct 8, 2025
76562dc
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Oct 8, 2025
3eb5525
add arg parsing tests
WenqingLan1 Oct 9, 2025
4785fe6
refactor
WenqingLan1 Oct 9, 2025
1fb7c05
refine logic - remove gpu_id
WenqingLan1 Oct 9, 2025
83c442c
add doc
WenqingLan1 Oct 9, 2025
4b274c4
refine regex & update nvbench submodule
WenqingLan1 Oct 9, 2025
0cf48bb
update cmake
WenqingLan1 Oct 10, 2025
5905647
fix lint
WenqingLan1 Oct 10, 2025
baa57c9
fix lint
WenqingLan1 Oct 10, 2025
ecce2d9
fix import
WenqingLan1 Oct 10, 2025
3a58ead
fix
WenqingLan1 Oct 10, 2025
d0d8773
fix
WenqingLan1 Oct 10, 2025
fbb5969
fix
WenqingLan1 Oct 10, 2025
f007745
fix
WenqingLan1 Oct 10, 2025
b6b6082
fix
WenqingLan1 Oct 10, 2025
0f2c838
fix
WenqingLan1 Oct 10, 2025
5bd20f6
fix
WenqingLan1 Oct 10, 2025
ab88d25
fix pipeline
WenqingLan1 Oct 10, 2025
3faaf60
fix cmake
WenqingLan1 Oct 13, 2025
896a46a
fix pipeline
WenqingLan1 Oct 14, 2025
5d4986b
fix pipeline
WenqingLan1 Oct 14, 2025
b246522
fix pipeline & mlc version
WenqingLan1 Oct 14, 2025
ffe182e
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Dec 17, 2025
2877feb
Merge branch 'main' into feat/third_party/nvbench
WenqingLan1 Dec 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}
- name: Autobuild
uses: github/codeql-action/autobuild@v2
uses: github/codeql-action/autobuild@v3
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
uses: github/codeql-action/analyze@v3
analyze-cpp:
name: CodeQL analyze cpp
runs-on: ubuntu-latest
Expand All @@ -52,12 +52,16 @@ jobs:
- name: Install Dependency
run: |
DEBIAN_FRONTEND=noninteractive apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo build-essential
- name: Setup CMake
uses: lukka/get-cmake@latest
with:
cmakeVersion: '3.20.0'
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
uses: github/codeql-action/init@v3
with:
languages: cpp
- name: Build
run: make cppbuild -j
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
uses: github/codeql-action/analyze@v3
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ cython_debug/
*.userosscache
*.sln.docstates

# Build temporary files
compile_commands.json

# Build results
[Dd]ebug/
[Dd]ebugPublic/
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@
[submodule "third_party/nvbandwidth"]
path = third_party/nvbandwidth
url = https://github.com/NVIDIA/nvbandwidth.git
[submodule "third_party/nvbench"]
path = third_party/nvbench
url = https://github.com/NVIDIA/nvbench.git
23 changes: 22 additions & 1 deletion dockerfile/cuda12.8.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,27 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -161,7 +182,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda_with_msccl
RUN make -C third_party cuda_with_msccl cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
Expand Down
24 changes: 23 additions & 1 deletion dockerfile/cuda12.9.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,28 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*


ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -162,7 +184,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda_with_msccl
RUN make -C third_party cuda_with_msccl cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
Expand Down
23 changes: 22 additions & 1 deletion dockerfile/cuda13.0.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,27 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -151,7 +172,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda
RUN make -C third_party cuda cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
Expand Down
2 changes: 1 addition & 1 deletion dockerfile/rocm5.0.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ RUN cd /tmp && \

# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
Expand Down
45 changes: 45 additions & 0 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,51 @@ Supports the use of double unit types and the use of tensor cores.
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |

### `nvbench-sleep-kernel`

#### Introduction

Measure GPU kernel execution time using NVBench's sleep kernel benchmark. This benchmark creates CUDA kernels that sleep for specified durations (in microseconds) and measures the actual execution time, providing insights into GPU scheduling overhead and timing accuracy.

The benchmark supports multiple duration specification formats:
- Single value: `"50"` - Test single duration of 50μs
- List format: `"[25,50,75]"` - Test multiple specific durations
- Range format: `"[25:75]"` - Test all values from 25μs to 75μs
- Range with step: `"[0:50:10]"` - Test from 0μs to 50μs in steps of 10μs

Performed by [NVBench](https://github.com/NVIDIA/nvbench) sleep kernel benchmark.

#### Metrics

| Name | Unit | Description |
|-----------------------------------------|-----------|-------------------------------------------------------|
| nvbench-sleep-kernel/duration_us_{X}_cpu_time | time (μs) | CPU-measured time for duration X microseconds. |
| nvbench-sleep-kernel/duration_us_{X}_gpu_time | time (μs) | GPU-measured time for duration X microseconds. |
| nvbench-sleep-kernel/duration_us_{X}_batch_gpu_time | time (μs) | GPU batch execution time for duration X microseconds. |

Where `{X}` is the sleep duration in microseconds (e.g., 25, 50, 75).

### `nvbench-kernel-launch`

#### Introduction

Measure GPU kernel launch overhead and execution time using NVBench's kernel launch benchmark. This benchmark evaluates the time required to launch kernels on the GPU and measures both CPU-side and GPU-side timing for kernel execution.

The benchmark provides insights into:
- Kernel launch latency
- CPU/GPU synchronization overhead
- Batch execution performance

Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmark.

#### Metrics

| Name | Unit | Description |
|-------------------------------------|-----------|------------------------------------------------|
| nvbench-kernel-launch/cpu_time | time (μs) | CPU-measured kernel execution time. |
| nvbench-kernel-launch/gpu_time | time (μs) | GPU-measured kernel execution time. |
| nvbench-kernel-launch/batch_gpu_time | time (μs) | GPU batch execution time. |

### `cpu-hpl`

#### Introduction
Expand Down
30 changes: 30 additions & 0 deletions examples/benchmarks/nvbench_kernel_launch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Example of NVBench Kernel Launch benchmark."""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger


def main():
    """Run the nvbench kernel launch benchmark and log its result."""
    context = BenchmarkRegistry.create_benchmark_context(
        'nvbench-kernel-launch',
        platform=Platform.CUDA,
        parameters=(
            '--timeout 30 '
            '--min-samples 10 '
            '--min-time 1.0 '
            '--max-noise 0.1 '
            '--stopping-criterion stdrel '
            '--throttle-threshold 80 '
            '--throttle-recovery-delay 1.0'
        )
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(
                benchmark.name, benchmark.return_code, benchmark.result
            )
        )
    else:
        # Surface launch failures instead of exiting silently, matching the
        # other nvbench example scripts.
        logger.error('benchmark: nvbench-kernel-launch launch failed.')


if __name__ == '__main__':
    main()
28 changes: 28 additions & 0 deletions examples/benchmarks/nvbench_sleep_kernel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Example of NVBench Sleep Kernel benchmark."""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger


def main():
    """Main method to run the nvbench sleep kernel benchmark."""
    context = BenchmarkRegistry.create_benchmark_context(
        'nvbench-sleep-kernel', platform=Platform.CUDA, parameters='--duration_us "[25,50,75]" --timeout 10'
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    # Guard clause: bail out with an error log if the benchmark failed to launch.
    if not benchmark:
        logger.error('benchmark: nvbench-sleep-kernel launch failed.')
        return

    logger.info(
        'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
    )


if __name__ == '__main__':
    main()
46 changes: 10 additions & 36 deletions superbench/benchmarks/micro_benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,42 +39,16 @@
from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel

__all__ = [
'BlasLtBaseBenchmark',
'ComputationCommunicationOverlap',
'CpuMemBwLatencyBenchmark',
'CpuHplBenchmark',
'CpuStreamBenchmark',
'CublasBenchmark',
'CublasLtBenchmark',
'CudaGemmFlopsBenchmark',
'CudaMemBwBenchmark',
'CudaNcclBwBenchmark',
'CudnnBenchmark',
'DiskBenchmark',
'DistInference',
'HipBlasLtBenchmark',
'GPCNetBenchmark',
'GemmFlopsBenchmark',
'GpuBurnBenchmark',
'GpuCopyBwBenchmark',
'GpuStreamBenchmark',
'IBBenchmark',
'IBLoopbackBenchmark',
'KernelLaunch',
'MemBwBenchmark',
'MicroBenchmark',
'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark',
'RocmGemmFlopsBenchmark',
'RocmMemBwBenchmark',
'ShardingMatmul',
'TCPConnectivityBenchmark',
'TensorRTInferenceBenchmark',
'DirectXGPUEncodingLatency',
'DirectXGPUCopyBw',
'DirectXGPUMemBw',
'DirectXGPUCoreFlops',
'NvBandwidthBenchmark',
'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap', 'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark',
'CpuStreamBenchmark', 'CublasBenchmark', 'CublasLtBenchmark', 'CudaGemmFlopsBenchmark', 'CudaMemBwBenchmark',
'CudaNcclBwBenchmark', 'CudnnBenchmark', 'DiskBenchmark', 'DistInference', 'HipBlasLtBenchmark', 'GPCNetBenchmark',
'GemmFlopsBenchmark', 'GpuBurnBenchmark', 'GpuCopyBwBenchmark', 'GpuStreamBenchmark', 'IBBenchmark',
'IBLoopbackBenchmark', 'KernelLaunch', 'MemBwBenchmark', 'MicroBenchmark', 'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark', 'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark', 'ShardingMatmul',
'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw',
'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel'
]
45 changes: 45 additions & 0 deletions superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
cmake_minimum_required(VERSION 3.18)
project(nvbench_benchmarks LANGUAGES CUDA)

# nvbench itself requires CMake >= 3.30.4; skip gracefully on older versions
# instead of failing the configure of the whole tree.
if(CMAKE_VERSION VERSION_LESS "3.30.4")
    message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.30.4 (required by nvbench), skipping nvbench benchmarks")
    return()
endif()

find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
    # Use an explicit path so the include does not depend on how this
    # directory is reached (bare relative paths are fragile).
    include("${CMAKE_CURRENT_SOURCE_DIR}/../cuda_common.cmake")

    # Try to find nvbench, but don't require it.
    find_package(nvbench CONFIG QUIET)

    if(nvbench_FOUND)
        message(STATUS "Found nvbench, building nvbench benchmarks")

        # List all CUDA benchmark source files here; one executable is built
        # and installed per source file.
        set(NVBENCH_SOURCES
            kernel_launch.cu
            sleep_kernel.cu
            # add more *.cu as needed
        )

        foreach(src IN LISTS NVBENCH_SOURCES)
            # Executable name: "nvbench_" + source file name without extension.
            get_filename_component(basename "${src}" NAME_WE)
            set(target_name nvbench_${basename})

            add_executable(${target_name} ${src})
            target_compile_features(${target_name} PUBLIC cuda_std_17)
            target_link_libraries(${target_name}
                PRIVATE nvbench::nvbench nvbench::main
            )
            install(TARGETS ${target_name} RUNTIME DESTINATION bin)
        endforeach()
    else()
        message(STATUS "nvbench not found, skipping nvbench benchmarks.")
        message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench")
    endif()
else()
    message(STATUS "CUDA not found, skipping nvbench benchmarks.")
endif()
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include <nvbench/nvbench.cuh>

// Kernel with an empty body: launching it measures pure launch overhead.
__global__ void noop_kernel() {}

// Benchmark entry point: repeatedly launch the no-op kernel on the
// nvbench-provided stream with a single thread.
void kernel_launch(nvbench::state &state) {
    state.exec([](nvbench::launch &nvb_launch) {
        noop_kernel<<<1, 1, 0, nvb_launch.get_stream()>>>();
    });
}

NVBENCH_BENCH(kernel_launch);
Loading
Loading