Skip to content

Commit

Permalink
CUTLASS 3.2.1 (NVIDIA#1113)
Browse files Browse the repository at this point in the history
* Updates for 3.2.1 release.

* Minor fix in gemm op profiler for raster order.

* Add scheduler mapping for raster order in the kernels.
  • Loading branch information
ANIKET-SHIVAM authored Sep 26, 2023
1 parent e0aaa3c commit 90d3b0f
Show file tree
Hide file tree
Showing 428 changed files with 22,241 additions and 21,750 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# NVIDIA CUTLASS Changelog

## [3.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.1) (2023-09-22)
* Python support for SM90 Epilogue Visitor Tree (EVT) on top of the C++ support released in 3.2.0.
* SM80 EVT support in C++ and Python.
* Other SM90 epilogue improvements.
* Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details.
* Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](/python/README.md) for details.
* SM90 TF32 kernel improvements for all layouts.
* SM90 rasterization direction support in the CUTLASS profiler.
* Improvement for CUTLASS profiler build times.
* Remove Python-C++ bindings.

## [3.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.0) (2023-08-03)

Expand Down Expand Up @@ -91,7 +101,7 @@
* [Few channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities
* [Fixed channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size
* [Unit tests](/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu)
* [Python-based instance emitter](/tools/library/scripts/generator.py) in the CUTLASS Library and support in the Profiler
* [Python-based instance emitter](/python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler
* [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores
* Supported types: f32, cf32, f64, cf64, tf32x3, complex tf32x3
* [HERK](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py)
Expand Down
59 changes: 43 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ endif()
message(STATUS "CMake Version: ${CMAKE_VERSION}")
set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++11 if set")

project(CUTLASS VERSION 3.2.0 LANGUAGES CXX)
project(CUTLASS VERSION 3.2.1 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)

if (CUDA_VERSION VERSION_LESS 11.3)
Expand Down Expand Up @@ -85,17 +85,38 @@ message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}")
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive

find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED)

# Install cutlass_library Python package
execute_process(
WORKING_DIRECTORY ${CUTLASS_DIR}/python
COMMAND ${Python3_EXECUTABLE} ${CUTLASS_DIR}/python/setup_library.py develop --user
RESULT_VARIABLE cutlass_lib_GENERATOR_INSTALL_RESULT
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log
)

if(NOT cutlass_lib_GENERATOR_INSTALL_RESULT EQUAL 0)
message(FATAL_ERROR "Error installing cutlass_library package. See ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log")
endif()

################################################################################
set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")

if(CUTLASS_ENABLE_HEADERS_ONLY)
set(CUTLASS_ENABLE_EXAMPLES_INIT OFF)
set(CUTLASS_ENABLE_TOOLS_INIT ON)
set(CUTLASS_ENABLE_LIBRARY_INIT OFF)
set(CUTLASS_ENABLE_TESTS_INIT OFF)
else()
set(CUTLASS_ENABLE_EXAMPLES_INIT ON)
set(CUTLASS_ENABLE_TOOLS_INIT ON)
set(CUTLASS_ENABLE_LIBRARY_INIT ON)
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
set(CUTLASS_ENABLE_TESTS_INIT ON)
else()
set(CUTLASS_ENABLE_TESTS_INIT OFF)
endif()
endif()

set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
Expand All @@ -104,20 +125,10 @@ set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable C
set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools")
set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUTLASS Library")
set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler")
set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Proformance")

if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_LIBRARY})
else()
set(CUTLASS_ENABLE_TESTS_INIT OFF)
endif()
set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Performance")

set(CUTLASS_ENABLE_TESTS ${CUTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable CUTLASS Tests")

if (CUTLASS_ENABLE_TESTS)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
endif()

set(CUTLASS_ENABLE_GTEST_UNIT_TESTS ${CUTLASS_ENABLE_TESTS} CACHE BOOL "Enable CUTLASS GTest-based Unit Tests")
################################################################################

set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
Expand Down Expand Up @@ -285,6 +296,8 @@ if (CUTLASS_ENABLE_TENSOR_CORE_MMA)
endif()




if (NOT MSVC AND CUTLASS_NVCC_KEEP)
# MSVC flow handles caching already, but for other generators we handle it here.
set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files")
Expand Down Expand Up @@ -395,6 +408,7 @@ endif()
# Some tests require this build option in order to link.
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj")
endif()

function(cutlass_apply_cuda_gencode_flags TARGET)
Expand Down Expand Up @@ -572,11 +586,17 @@ target_include_directories(
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CUTLASS_INCLUDE_DIR}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
$<BUILD_INTERFACE:${CUDA_TOOLKIT_ROOT_DIR}/include>
$<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
)

# Mark CTK headers as system to supress warnings from them
target_include_directories(
CUTLASS
SYSTEM INTERFACE
$<BUILD_INTERFACE:${CUDA_TOOLKIT_ROOT_DIR}/include>
)

install(
DIRECTORY
${CUTLASS_INCLUDE_DIR}/
Expand Down Expand Up @@ -633,6 +653,11 @@ endif()

include(CTest)
enable_testing()

if (CUTLASS_ENABLE_GTEST_UNIT_TESTS)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
endif()

if (NOT TARGET test_all)
add_custom_target(test_all)
endif()
Expand Down Expand Up @@ -818,7 +843,7 @@ function(cutlass_add_executable_tests NAME TARGET)

set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/${TEST_NAME}/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "")

if (CUTLASS_INSTALL_TESTS)
if (CUTLASS_INSTALL_TESTS)

file(GENERATE
OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake"
Expand All @@ -831,7 +856,7 @@ function(cutlass_add_executable_tests NAME TARGET)
RENAME CTestTestfile.${TEST_NAME}.cmake
)

endif()
endif()

endfunction()

Expand All @@ -849,7 +874,9 @@ endif()

if (CUTLASS_ENABLE_TESTS)
add_subdirectory(test)
if (CUTLASS_ENABLE_GTEST_UNIT_TESTS)
add_dependencies(test_all test_unit)
endif()
endif()

if (CUTLASS_INSTALL_TESTS)
Expand Down
4 changes: 2 additions & 2 deletions CUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -305,10 +305,10 @@ function(cutlass_add_library NAME)

if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang")
cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
add_library(${NAME} ${TARGET_SOURCE_ARGS})
add_library(${NAME} ${TARGET_SOURCE_ARGS} "")
else()
set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS})
cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS} "")
endif()

cutlass_apply_standard_compile_options(${NAME})
Expand Down
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ In addition to GEMMs, CUTLASS implements high-performance convolution via the im

# What's New in CUTLASS 3.2

CUTLASS 3.2 is an update to CUTLASS adding:
CUTLASS 3.2.0 is an update to CUTLASS adding:
- New warp-specialized persistent FP8 GEMM kernel [kernel schedules](/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](/examples/54_hopper_fp8_warp_specialized_gemm).
- New [Epilogue Visitor Tree (EVT)](/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allows for user-defined customized epilogue fusion patterns without having to write a new epilogue.
- [Stream-K](/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release.
Expand All @@ -53,6 +53,14 @@ CUTLASS 3.2 is an update to CUTLASS adding:
- New CUTLASS 2D Convolution Python interface. New [example](/examples/python/03_basic_conv2d.ipynb) here.
- Support for Windows (MSVC) builds.

CUTLASS 3.2.1 is an update to CUTLASS adding:
- Python support for SM90 Epilogue Visitor Tree (EVT) on top of the C++ support released in 3.2.0.
- SM80 EVT support in C++ and Python.
- Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details.
- Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](/python/README.md) for details.
- SM90 TF32 kernel improvements for all layouts.
- SM90 rasterization direction support in the CUTLASS profiler.
- Improvement for CUTLASS profiler build times.

Minimum requirements:

Expand Down Expand Up @@ -176,7 +184,8 @@ CUTLASS is a header-only template library and does not need to be built to be us
projects. Client applications should target CUTLASS's `include/` directory in their include
paths.

CUTLASS unit tests, examples, and utilities can be build with CMake starting version 3.12.
CUTLASS unit tests, examples, and utilities can be built with CMake.
The minimum version of CMake is given in the [Quickstart guide](media/docs/quickstart.md).
Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed
on your system.

Expand Down Expand Up @@ -512,7 +521,7 @@ reference_device: Passed
## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler
- Please follow the links for more CMake examples on selectively compiling CUTLASS kernels:
- [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples)
- [Implicit GEMM conovlution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
- [Implicit GEMM convolution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
- [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)


Expand Down
9 changes: 7 additions & 2 deletions cmake/NvidiaCutlassConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ get_filename_component(NvidiaCutlass_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH

include(CMakeFindDependencyMacro)

if(NOT TARGET nvidia::cutlass::CUTLASS)
include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake")
if(TARGET nvidia::cutlass::CUTLASS)
return()
endif()

include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake")

# For backward compatibility with the old name
add_library(cutlass_lib ALIAS cutlass_library)
1 change: 0 additions & 1 deletion examples/08_turing_tensorop_gemm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,5 @@
cutlass_example_add_executable(
08_turing_tensorop_gemm
turing_tensorop_gemm.cu
DISABLE_TESTS ON
)

5 changes: 2 additions & 3 deletions examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,8 @@ int run() {
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementComputeEpilogue>
int32_t,
int32_t>
gemm_device;

// Launch device reference gemm kernel
Expand Down Expand Up @@ -355,4 +355,3 @@ int main() {

return run();
}

Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM
#include "cutlass/util/tensor_view_io.h"

#include "helper.h"

// The code section below describes datatype for input, output tensors and computation between
// elements
using ElementAccumulator = int32_t; // Data type of accumulator
Expand Down Expand Up @@ -675,7 +674,6 @@ Result profile_convolution(Options const &options) {

return result;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char const **args) {
Expand Down Expand Up @@ -762,11 +760,7 @@ int main(int argc, char const **args) {
Result::print_header(std::cout, options) << std::endl;
result.print(std::cout, 1, options) << std::endl;
}

return 0;
}

/////////////////////////////////////////////////////////////////////////////////////////////////



1 change: 0 additions & 1 deletion examples/12_gemm_bias_relu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,5 @@
cutlass_example_add_executable(
12_gemm_bias_relu
gemm_bias_relu.cu
DISABLE_TESTS ON
)

Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,6 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_rf_res() {

return pass;
}

int main() {

std::vector<bool (*)()>funcs = {
Expand All @@ -229,10 +228,6 @@ int main() {
};

return testRun(75, funcs, "conv int8 RF residency");

}



////////////////////////////////////////////////////////////////////////////////

Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#include "test_run.h"

////////////////////////////////////////////////////////////////////////////////

cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
Expand Down Expand Up @@ -219,20 +218,13 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_shmem() {

return pass;
}


int main() {

std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_s8_sm75,
&run_fused_conv2d_fprop_optimized_s8_sm75_shmem
};

return testRun(75, funcs, "conv int8 shmem staging");

}



////////////////////////////////////////////////////////////////////////////////

Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@ bool run_fused_gemm_s8_rf_res() {
return passed;

}

int main() {

std::vector<bool (*)()>funcs = {
Expand All @@ -204,9 +203,6 @@ int main() {
};

return testRun(75, funcs, "gemm int8 RF residency");


}


////////////////////////////////////////////////////////////////////////////////
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#include "test_run.h"

////////////////////////////////////////////////////////////////////////////////

cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576);
Expand Down Expand Up @@ -197,18 +196,13 @@ bool run_fused_gemm_s8_shmem() {
return passed;

}

int main() {

std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_s8,
&run_fused_gemm_s8_shmem
};

return testRun(75, funcs, "gemm int8 shmem staing");


}


////////////////////////////////////////////////////////////////////////////////
Loading

0 comments on commit 90d3b0f

Please sign in to comment.