CUTLASS 2.6 (NVIDIA#298)

Manish Gupta authored Jul 23, 2021
1 parent 6c29fe2 commit e5d5184
Showing 308 changed files with 33,002 additions and 5,316 deletions.
27 changes: 27 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,33 @@

# CUTLASS 2.x

## [2.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.6.0) (2021-07-22)
* Optimal performance when compiled with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit)
* Adopt the new L2 prefetch feature in [cp.async](/include/cutlass/arch/memory.h) and [global load](/include/cutlass/arch/memory_sm80.h) (see the first sketch after this list)
* Fused operators with GEMM and Convolution
* [Fused broadcast in epilogue](test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu)
* [Fused partial reduction in epilogue](/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu)
* 64b tensor strides and leading dimensions support for GEMMs
* Affine rank=2 matrix layouts
* Row stride and column stride for matrices using [cutlass::layout::AffineRank2](/include/cutlass/layout/matrix.h) (see the second sketch after this list)
* Supports [FP64 tensor core](/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM.
* [Batched GEMV](/test/unit/gemm/device/gemv.cu) preview implementation
* [New strided Dgrad](test/unit/gemm/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation
* Accelerates the previous implementation by cutting down redundant math by 4x
* Supports the new `Dy` and `w` analytic iterators and the existing `cutlass::conv::device::ImplicitGemmConvolution` interface
* Quaternion-valued GEMM and Convolution in single- and double-precision (targeting CUDA Cores)
* Updates to [quaternion.h](/include/cutlass/quaternion.h) and [functional.h](/include/cutlass/functional.h)
* SDK examples for [GEMM](/examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](/examples/22_quaternion_conv/quaternion_conv.cu)
* [Unit tests for GEMM](/test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu)
* Many improvements to the epilogue.
* Provide an [option](/include/cutlass/epilogue/threadblock/epilogue.h) to not fully unroll the epilogue, reducing code size and improving performance when using complicated elementwise operations
* Performance improvement for FP16 tensor core kernels
* Bug fixes
* Updated minimum CUDA Toolkit requirement to 10.2
* [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit) recommended
* Corrections and bug fixes reported by the CUTLASS community
* Thank you for filing these issues!
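
Below is a minimal, hypothetical sketch of the L2-prefetch `cp.async` path mentioned above, not the CUTLASS implementation (that lives in /include/cutlass/arch/memory_sm80.h). It assumes an sm_80+ target compiled with CUDA 11.4, since the `.L2::128B` qualifier requires PTX ISA 7.4.

```cpp
// Hedged sketch: one 16-byte asynchronous global->shared copy with a
// 128-byte L2 prefetch hint, followed by a commit/wait to make it visible.
__device__ void copy_16b_with_l2_prefetch(void *smem_dst, void const *gmem_src) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
  // cp.async takes a shared-memory address, not a generic pointer.
  unsigned smem_addr =
      static_cast<unsigned>(__cvta_generic_to_shared(smem_dst));
  asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], 16;\n"
               :: "r"(smem_addr), "l"(gmem_src));
  asm volatile("cp.async.commit_group;\n");
  asm volatile("cp.async.wait_group 0;\n");
#endif
}
```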

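A companion sketch for the affine rank-2 layout bullet above: the layout maps a logical (row, column) coordinate to memory as row * row_stride + column * column_stride, so neither stride has to be 1. The function below is illustrative only; the real layout classes, such as cutlass::layout::AffineRank2ColumnMajor, are in /include/cutlass/layout/matrix.h.

```cpp
#include <cstdint>

// Illustrative offset function for an affine rank-2 layout. Both strides are
// free parameters, kept 64-bit to match the new 64b stride support above.
int64_t affine_rank2_offset(int row, int column,
                            int64_t row_stride, int64_t column_stride) {
  return int64_t(row) * row_stride + int64_t(column) * column_stride;
}
```
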
## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26)
* Tensor reductions
* _m_-to-_n_ reductions of tensors with affine layout
39 changes: 35 additions & 4 deletions CMakeLists.txt
@@ -32,9 +32,15 @@ endif()

message(STATUS "CMake Version: ${CMAKE_VERSION}")

project(CUTLASS VERSION 2.5.0 LANGUAGES CXX)
project(CUTLASS VERSION 2.6.0 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)

if (CUDA_VERSION VERSION_LESS 10.2)
message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 10.2 or higher, and strongly recommends CUDA 11.0 or higher.")
elseif (CUDA_VERSION VERSION_LESS 11.0)
message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.0 or higher.")
endif()

find_package(Doxygen QUIET)

#
@@ -105,7 +111,7 @@ endif()
if (NOT CUDA_VERSION VERSION_LESS 11.0)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80)
endif()
if (NOT CUDA_VERSION VERSION_LESS 11.1)
if (NOT CUDA_VERSION VERSION_LESS 11.1 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 86)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
@@ -275,7 +281,14 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
message(FATAL_ERROR "Clang 7.0+ required for GPU compilation")
endif()

# There are numerous Clang versions that can work with each CUDA toolkit and
# the checks are not very useful, so we are turning them off and using testing
# to ensure the various combinations work properly.

list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -D__NV_NO_HOST_COMPILER_CHECK=1)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unknown-cuda-version)

list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -pragma-unroll-threshold=100000)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -unroll-threshold=5000)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unused-command-line-argument)
@@ -294,18 +307,28 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
link_libraries(nvidia::cudart)
endif()

if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
# CMake 3.18 added support for CUDA_ARCHITECTURES target property. We will use this
# property for CMake 3.18+, so we request the NEW behavior for correct compatibility.
# https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104
cmake_policy(SET CMP0104 NEW)
endif()

function(cutlass_apply_cuda_gencode_flags TARGET)

set(NVCC_FLAGS)
set(CLANG_FLAGS)
set(__CMAKE_CUDA_ARCHS)
foreach(ARCH ${CUTLASS_NVCC_ARCHS_ENABLED})
list(APPEND CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH})
set(CODES)
if(CUTLASS_NVCC_EMBED_CUBIN)
list(APPEND CODES sm_${ARCH})
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-real)
endif()
if(CUTLASS_NVCC_EMBED_PTX)
list(APPEND CODES compute_${ARCH})
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-virtual)
endif()
list(JOIN CODES "," CODES_STR)
list(APPEND NVCC_FLAGS -gencode=arch=compute_${ARCH},code=[${CODES_STR}])
@@ -317,6 +340,8 @@ function(cutlass_apply_cuda_gencode_flags TARGET)
PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:${CLANG_FLAGS}>
)
elseif(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
set_property(TARGET ${TARGET} PROPERTY CUDA_ARCHITECTURES ${__CMAKE_CUDA_ARCHS})
else()
target_compile_options(
${TARGET}
@@ -542,10 +567,14 @@ function(cutlass_add_executable_tests NAME TARGET)
#

set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(oneValueArgs)
set(oneValueArgs DISABLE_TESTS)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})


if (NOT DEFINED __DISABLE_TESTS)
set(__DISABLE_TESTS OFF)
endif()

if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND CUTLASS_INSTALL_TESTS)

# file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
@@ -610,6 +639,8 @@ function(cutlass_add_executable_tests NAME TARGET)
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
)

set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS})

if (CUTLASS_INSTALL_TESTS)

# To run the tests from an install package with tests enabled, we need to generate test files
33 changes: 25 additions & 8 deletions README.md
@@ -1,8 +1,8 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")

# CUTLASS 2.5
# CUTLASS 2.6

_CUTLASS 2.5 - February 2021_
_CUTLASS 2.6 - July 2021_

CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@@ -34,12 +34,24 @@ See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
See the [functionality listing](/media/docs/functionality.md) for the list of operations
supported at each level of the execution model hierarchy.

# What's New in CUTLASS 2.6
CUTLASS 2.6 is a minor update to CUTLASS adding:
- Fused [broadcast](test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu) and [reductions](/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu) in the epilogues of GEMM and Convolution
- [Quaternion-valued GEMM](/examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](/examples/22_quaternion_conv/quaternion_conv.cu) in single-precision (see the sketch after this list)
- [New strided Dgrad](test/unit/gemm/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation offers up to 4x performance improvement over the previous strided Dgrad
- 64-bit strides for large tensor allocations
- [General affine layouts](/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) in FP64 tensor core and SIMT GEMM
- Enhanced functionality, boosted performance, and bug fixes in the epilogue.
- Optimal performance when compiled with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- Adopts the new L2 prefetch feature in [PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-isa-version-7-4).
- Numerous updates from the community (thanks!)
- See the [CHANGELOG](CHANGELOG.md) for more details
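
As a hedged illustration of the quaternion support listed above (the type comes from /include/cutlass/quaternion.h; the operator overloads are assumed to behave as in the SDK example):

```cpp
#include "cutlass/quaternion.h"

// Sketch: one quaternion multiply-accumulate, the scalar operation a
// quaternion GEMM or Convolution performs per element. A single quaternion
// multiply expands into 16 real multiply-adds across the four components,
// which is why these kernels target CUDA cores rather than Tensor Cores.
cutlass::Quaternion<float> qmac(cutlass::Quaternion<float> const &a,
                                cutlass::Quaternion<float> const &b,
                                cutlass::Quaternion<float> const &c) {
  return a * b + c;
}
```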

# What's New in CUTLASS 2.5
CUTLASS 2.5 is a minor update to CUTLASS adding:
- [Tensor reductions](/test/unit/reduction/device/tensor_reduce_contiguous.cu)
- [Optimizations for 3-D convolution](include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h)
- [Fused Convolution+Convolution example](/examples/13_two_tensor_op_fusion/README.md)
- See the [CHANGELOG](CHANGELOG.md) for more details

# What's New in CUTLASS 2.4
CUTLASS 2.4 is a significant update to CUTLASS adding:
@@ -52,7 +64,7 @@ CUTLASS 2.4 is a significant update to CUTLASS adding:
CUTLASS 2.3 is a minor update to CUTLASS adding:
- GEMMs targeting structured [Sparse Tensor Cores](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) in NVIDIA Ampere Architecture GPUs
- Fast SGEMM kernels targeting GeForce RTX 30-series CUDA Cores
- Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit) or later

# What's New in CUTLASS 2.2

@@ -62,7 +74,7 @@ CUTLASS 2.2 is a significant update to CUTLASS adding:
- Tensor Core-accelerated GEMMs targeting Tensor Float 32, BFloat16, and double-precision data types
- Deep software pipelines using asynchronous copy
- Described in [GTC 2020 Webinar (SR 21745)](https://developer.nvidia.com/gtc/2020/video/s21745)
- Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit) or later

# What's New in CUTLASS 2.1

@@ -95,8 +107,8 @@ using CUDA 11.0 Toolkit. Tensor Core operations are implemented using CUDA's
# Compatibility

CUTLASS requires a C++11 host compiler and
performs best when built with the [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, CUDA 10.2, and CUDA 11.0.
performs best when built with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is also compatible with CUDA 10.2, CUDA 11.0, CUDA 11.1, CUDA 11.2, and CUDA 11.3.

We have tested the following environments.

@@ -106,12 +118,16 @@ We have tested the following environments.
| | Microsoft Visual Studio 2017|
| Ubuntu 16.04 | GCC 5.4.0 |
| Ubuntu 18.04 | GCC 7.5.0 |
| Ubuntu 20.04 | GCC 10.2.0 |

Additionally, CUTLASS may be built with clang.
See [these instructions](media/docs/quickstart.md#clang) for more details.

CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
any Maxwell-, Pascal-, Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU.

For all GPUs, we recommend compiling with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit)
for best performance.

|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**|
|---|---|---|---|
@@ -511,6 +527,7 @@ CUTLASS is released by NVIDIA Corporation as Open Source software under the

The official list of CUTLASS developers and contributors is available here: [CONTRIBUTORS](CONTRIBUTORS.md).


# Copyright

Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
2 changes: 2 additions & 0 deletions cmake/CTestTestfile.config.cmake
@@ -17,3 +17,5 @@ add_test("@TEST_NAME@" ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}"
if (NOT "@TEST_EXE_WORKING_DIRECTORY@" STREQUAL "")
set_tests_properties("@TEST_NAME@" PROPERTIES WORKING_DIRECTORY "@TEST_EXE_WORKING_DIRECTORY@")
endif()

set_tests_properties(@TEST_NAME@ PROPERTIES DISABLED @__DISABLE_TESTS@)
6 changes: 3 additions & 3 deletions examples/01_cutlass_utilities/cutlass_utilities.cu
@@ -119,12 +119,12 @@ cudaError_t cutlass_hgemm_nn(
int K,
cutlass::half_t alpha,
cutlass::half_t const *A,
int lda,
cutlass::layout::ColumnMajor::Stride::Index lda,
cutlass::half_t const *B,
int ldb,
cutlass::layout::ColumnMajor::Stride::Index ldb,
cutlass::half_t beta,
cutlass::half_t *C,
int ldc) {
cutlass::layout::ColumnMajor::Stride::Index ldc) {

// Define the GEMM operation
using Gemm = cutlass::gemm::device::Gemm<
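
The signature change above replaces plain `int` leading dimensions with the layout's stride index type. A hedged sketch of the intent, assuming (per the 2.6 changelog's 64b stride support) that the stride index widens to a 64-bit type:

```cpp
#include "cutlass/layout/matrix.h"

// If ColumnMajor::Stride::Index is a 64-bit type in 2.6 (an assumption),
// leading dimensions can exceed INT_MAX, and callers that use the alias
// instead of a hard-coded int pick up the wider type automatically.
using LdaIndex = cutlass::layout::ColumnMajor::Stride::Index;
static_assert(sizeof(LdaIndex) >= sizeof(int),
              "stride index should be at least as wide as int");
```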
2 changes: 1 addition & 1 deletion examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
@@ -67,7 +67,7 @@ beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the computation.
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
64x64x4, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in a
bank-conflict-free manner, and a ton of other variables required to compose, initialize and launch a
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
8 changes: 4 additions & 4 deletions examples/10_planar_complex/planar_complex.cu
@@ -275,10 +275,10 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;

int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);

int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
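
A note on the `* 2` factors above, reading the example rather than stating its layout definitively: in a planar-complex format each batch stores a real plane followed by an imaginary plane, so the batch stride is twice the per-plane element count, and `imag_stride_X` locates the imaginary plane relative to the real one.

```cpp
#include <cstdint>

// Sketch: per-batch stride for a planar-complex matrix of shape rows x cols,
// with the real and imaginary planes stored back to back.
int64_t planar_complex_batch_stride(int rows, int cols) {
  int64_t plane = int64_t(rows) * cols;  // elements in one plane
  return plane * 2;                      // real plane + imaginary plane
}
```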
9 changes: 5 additions & 4 deletions examples/11_planar_complex_array/planar_complex_array.cu
@@ -292,10 +292,11 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;

int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);


int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
4 changes: 4 additions & 0 deletions examples/13_two_tensor_op_fusion/README.md
@@ -48,6 +48,10 @@ addition to its own input activation tile. Therefore the input activation warp t
2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
register file, and the operation can be fully register-file-resident.

When applying the above constraint to convolutions, the 2nd Convolution kernel must have no
halos, so that the data used by each threadblock does not depend on any other threadblock.
Typically this requires that the 2nd Convolution use a 1x1 filter without any padding.

# Copyright

Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.