CUTLASS 2.6 (NVIDIA#298)

Manish Gupta authored Jul 23, 2021
1 parent 6c29fe2 commit e5d5184
Showing 308 changed files with 33,002 additions and 5,316 deletions.
27 changes: 27 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,33 @@

# CUTLASS 2.x

## [2.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.6.0) (2021-07-22)
* Optimal performance when compiled with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit)
* Adopt the new L2 prefetch feature in [cp.async](/include/cutlass/arch/memory.h) and [global load](/include/cutlass/arch/memory_sm80.h) (see the first sketch after this list)
* Fused operators with GEMM and Convolution
* [Fused broadcast in epilogue](test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu)
* [Fused partial reduction in epilogue](/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu)
* 64b tensor strides and leading dimensions support for GEMMs
* Affine rank=2 matrix layouts
* Row stride and column stride for matrices using [cutlass::layout::AffineRank2](/include/cutlass/layout/matrix.h) (see the second sketch after this list)
* Supports [FP64 tensor core](/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM.
* [Batched GEMV](/test/unit/gemm/device/gemv.cu) preview implementation
* [New strided Dgrad](test/unit/gemm/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation
* Accelerates the previous implementation by cutting down redundant math by 4x
* Supports the new `Dy` and `w` analytic iterators and the existing `cutlass::conv::device::ImplicitGemmConvolution` interface
* Quaternion-valued GEMM and Convolution in single- and double-precision (targeting CUDA Cores)
* Updates to [quaternion.h](/include/cutlass/quaternion.h) and [functional.h](/include/cutlass/functional.h)
* SDK examples for [GEMM](/examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](/examples/22_quaternion_conv/quaternion_conv.cu)
* [Unit tests for GEMM](/test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu)
* Many improvements to the epilogue.
* Provide an [option](/include/cutlass/epilogue/threadblock/epilogue.h) to not fully unroll the epilogue, reducing code size and improving performance when using complicated elementwise operations
* Performance improvement for FP16 tensor core kernels
* Bug fixes
* Updated minimum CUDA Toolkit requirement to 10.2
* [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit) recommended
* Corrections and bug fixes reported by the CUTLASS community
* Thank you for filing these issues!
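
Below is a minimal, hypothetical sketch of the L2-prefetch `cp.async` path mentioned above, not the CUTLASS implementation (that lives in /include/cutlass/arch/memory_sm80.h). It assumes an sm_80+ target compiled with CUDA 11.4, since the `.L2::128B` qualifier requires PTX ISA 7.4.

```cpp
// Hedged sketch: one 16-byte asynchronous global->shared copy with a
// 128-byte L2 prefetch hint, followed by a commit/wait to make it visible.
__device__ void copy_16b_with_l2_prefetch(void *smem_dst, void const *gmem_src) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
  // cp.async takes a shared-memory address, not a generic pointer.
  unsigned smem_addr =
      static_cast<unsigned>(__cvta_generic_to_shared(smem_dst));
  asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], 16;\n"
               :: "r"(smem_addr), "l"(gmem_src));
  asm volatile("cp.async.commit_group;\n");
  asm volatile("cp.async.wait_group 0;\n");
#endif
}
```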

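A companion sketch for the affine rank-2 layout bullet above: the layout maps a logical (row, column) coordinate to memory as row * row_stride + column * column_stride, so neither stride has to be 1. The function below is illustrative only; the real layout classes, such as cutlass::layout::AffineRank2ColumnMajor, are in /include/cutlass/layout/matrix.h.

```cpp
#include <cstdint>

// Illustrative offset function for an affine rank-2 layout. Both strides are
// free parameters, kept 64-bit to match the new 64b stride support above.
int64_t affine_rank2_offset(int row, int column,
                            int64_t row_stride, int64_t column_stride) {
  return int64_t(row) * row_stride + int64_t(column) * column_stride;
}
```
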
## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26)
* Tensor reductions
* _m_-to-_n_ reductions of tensors with affine layout
39 changes: 35 additions & 4 deletions CMakeLists.txt
@@ -32,9 +32,15 @@ endif()

message(STATUS "CMake Version: ${CMAKE_VERSION}")

project(CUTLASS VERSION 2.5.0 LANGUAGES CXX)
project(CUTLASS VERSION 2.6.0 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)

if (CUDA_VERSION VERSION_LESS 10.2)
message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 10.2 or higher, and strongly recommends CUDA 11.0 or higher.")
elseif (CUDA_VERSION VERSION_LESS 11.0)
message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.0 or higher.")
endif()

find_package(Doxygen QUIET)

#
@@ -105,7 +111,7 @@ endif()
if (NOT CUDA_VERSION VERSION_LESS 11.0)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80)
endif()
if (NOT CUDA_VERSION VERSION_LESS 11.1)
if (NOT CUDA_VERSION VERSION_LESS 11.1 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 86)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
@@ -275,7 +281,14 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
message(FATAL_ERROR "Clang 7.0+ required for GPU compilation")
endif()

# There are numerous Clang versions that can work with each CUDA toolkit and
# the checks are not very useful, so we are turning them off and using testing
# to ensure the various combinations work properly.

list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -D__NV_NO_HOST_COMPILER_CHECK=1)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unknown-cuda-version)

list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -pragma-unroll-threshold=100000)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -unroll-threshold=5000)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unused-command-line-argument)
@@ -294,18 +307,28 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
link_libraries(nvidia::cudart)
endif()

if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
# CMake 3.18 added support for CUDA_ARCHITECTURES target property. We will use this
# property for CMake 3.18+, so we request the NEW behavior for correct compatibility.
# https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104
cmake_policy(SET CMP0104 NEW)
endif()

function(cutlass_apply_cuda_gencode_flags TARGET)

set(NVCC_FLAGS)
set(CLANG_FLAGS)
set(__CMAKE_CUDA_ARCHS)
foreach(ARCH ${CUTLASS_NVCC_ARCHS_ENABLED})
list(APPEND CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH})
set(CODES)
if(CUTLASS_NVCC_EMBED_CUBIN)
list(APPEND CODES sm_${ARCH})
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-real)
endif()
if(CUTLASS_NVCC_EMBED_PTX)
list(APPEND CODES compute_${ARCH})
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-virtual)
endif()
list(JOIN CODES "," CODES_STR)
list(APPEND NVCC_FLAGS -gencode=arch=compute_${ARCH},code=[${CODES_STR}])
@@ -317,6 +340,8 @@ function(cutlass_apply_cuda_gencode_flags TARGET)
PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:${CLANG_FLAGS}>
)
elseif(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
set_property(TARGET ${TARGET} PROPERTY CUDA_ARCHITECTURES ${__CMAKE_CUDA_ARCHS})
else()
target_compile_options(
${TARGET}
@@ -542,10 +567,14 @@ function(cutlass_add_executable_tests NAME TARGET)
#

set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(oneValueArgs)
set(oneValueArgs DISABLE_TESTS)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})


if (NOT DEFINED __DISABLE_TESTS)
set(__DISABLE_TESTS OFF)
endif()

if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND CUTLASS_INSTALL_TESTS)

# file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
@@ -610,6 +639,8 @@ function(cutlass_add_executable_tests NAME TARGET)
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
)

set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS})

if (CUTLASS_INSTALL_TESTS)

# To run the tests from an install package with tests enabled, we need to generate test files
33 changes: 25 additions & 8 deletions README.md
@@ -1,8 +1,8 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")

# CUTLASS 2.5
# CUTLASS 2.6

_CUTLASS 2.5 - February 2021_
_CUTLASS 2.6 - July 2021_

CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@@ -34,12 +34,24 @@ See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
See the [functionality listing](/media/docs/functionality.md) for the list of operations
supported at each level of the execution model hierarchy.

# What's New in CUTLASS 2.6
CUTLASS 2.6 is a minor update to CUTLASS adding:
- Fused [broadcast](test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu) and [reductions](/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu) in the epilogues of GEMM and Convolution
- [Quaternion-valued GEMM](/examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](/examples/22_quaternion_conv/quaternion_conv.cu) in single-precision (see the sketch after this list)
- [New strided Dgrad](test/unit/gemm/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation offers up to 4x performance improvement over the previous strided Dgrad
- 64-bit strides for large tensor allocations
- [General affine layouts](/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) in FP64 tensor core and SIMT GEMM
- Enhanced functionality, boosted performance, and bug fixes in the epilogue.
- Optimal performance when compiled with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- Adopts the new L2 prefetch feature in [PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-isa-version-7-4).
- Numerous updates from the community (thanks!)
- See the [CHANGELOG](CHANGELOG.md) for more details
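
As a hedged illustration of the quaternion support listed above (the type comes from /include/cutlass/quaternion.h; the operator overloads are assumed to behave as in the SDK example):

```cpp
#include "cutlass/quaternion.h"

// Sketch: one quaternion multiply-accumulate, the scalar operation a
// quaternion GEMM or Convolution performs per element. A single quaternion
// multiply expands into 16 real multiply-adds across the four components,
// which is why these kernels target CUDA cores rather than Tensor Cores.
cutlass::Quaternion<float> qmac(cutlass::Quaternion<float> const &a,
                                cutlass::Quaternion<float> const &b,
                                cutlass::Quaternion<float> const &c) {
  return a * b + c;
}
```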

# What's New in CUTLASS 2.5
CUTLASS 2.5 is a minor update to CUTLASS adding:
- [Tensor reductions](/test/unit/reduction/device/tensor_reduce_contiguous.cu)
- [Optimizations for 3-D convolution](include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h)
- [Fused Convolution+Convolution example](/examples/13_two_tensor_op_fusion/README.md)
- See the [CHANGELOG](CHANGELOG.md) for more details

# What's New in CUTLASS 2.4
CUTLASS 2.4 is a significant update to CUTLASS adding:
@@ -52,7 +64,7 @@ CUTLASS 2.4 is a significant update to CUTLASS adding:
CUTLASS 2.3 is a minor update to CUTLASS adding:
- GEMMs targeting structured [Sparse Tensor Cores](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) in NVIDIA Ampere Architecture GPUs
- Fast SGEMM kernels targeting GeForce RTX 30-series CUDA Cores
- Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit) or later

# What's New in CUTLASS 2.2

@@ -62,7 +74,7 @@ CUTLASS 2.2 is a significant update to CUTLASS adding:
- Tensor Core-accelerated GEMMs targeting Tensor Float 32, BFloat16, and double-precision data types
- Deep software pipelines using asynchronous copy
- Described in [GTC 2020 Webinar (SR 21745)](https://developer.nvidia.com/gtc/2020/video/s21745)
- Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit) or later

# What's New in CUTLASS 2.1

@@ -95,8 +107,8 @@ using CUDA 11.0 Toolkit. Tensor Core operations are implemented using CUDA's
# Compatibility

CUTLASS requires a C++11 host compiler and
performs best when built with the [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, CUDA 10.2, and CUDA 11.0.
performs best when built with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is also compatible with CUDA 10.2, CUDA 11.0, CUDA 11.1, CUDA 11.2, and CUDA 11.3.

We have tested the following environments.

@@ -106,12 +118,16 @@ We have tested the following environments.
| | Microsoft Visual Studio 2017|
| Ubuntu 16.04 | GCC 5.4.0 |
| Ubuntu 18.04 | GCC 7.5.0 |
| Ubuntu 20.04 | GCC 10.2.0 |

Additionally, CUTLASS may be built with clang.
See [these instructions](media/docs/quickstart.md#clang) for more details.

CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
any Maxwell-, Pascal-, Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU.

For all GPUs, we recommend compiling with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit)
for best performance.

|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**|
|---|---|---|---|
@@ -511,6 +527,7 @@ CUTLASS is released by NVIDIA Corporation as Open Source software under the

The official list of CUTLASS developers and contributors is available here: [CONTRIBUTORS](CONTRIBUTORS.md).


# Copyright

Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
2 changes: 2 additions & 0 deletions cmake/CTestTestfile.config.cmake
@@ -17,3 +17,5 @@ add_test("@TEST_NAME@" ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}"
if (NOT "@TEST_EXE_WORKING_DIRECTORY@" STREQUAL "")
set_tests_properties("@TEST_NAME@" PROPERTIES WORKING_DIRECTORY "@TEST_EXE_WORKING_DIRECTORY@")
endif()

set_tests_properties(@TEST_NAME@ PROPERTIES DISABLED @__DISABLE_TESTS@)
6 changes: 3 additions & 3 deletions examples/01_cutlass_utilities/cutlass_utilities.cu
@@ -119,12 +119,12 @@ cudaError_t cutlass_hgemm_nn(
int K,
cutlass::half_t alpha,
cutlass::half_t const *A,
int lda,
cutlass::layout::ColumnMajor::Stride::Index lda,
cutlass::half_t const *B,
int ldb,
cutlass::layout::ColumnMajor::Stride::Index ldb,
cutlass::half_t beta,
cutlass::half_t *C,
int ldc) {
cutlass::layout::ColumnMajor::Stride::Index ldc) {

// Define the GEMM operation
using Gemm = cutlass::gemm::device::Gemm<
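
The signature change above replaces plain `int` leading dimensions with the layout's stride index type. A hedged sketch of the intent, assuming (per the 2.6 changelog's 64b stride support) that the stride index widens to a 64-bit type:

```cpp
#include "cutlass/layout/matrix.h"

// If ColumnMajor::Stride::Index is a 64-bit type in 2.6 (an assumption),
// leading dimensions can exceed INT_MAX, and callers that use the alias
// instead of a hard-coded int pick up the wider type automatically.
using LdaIndex = cutlass::layout::ColumnMajor::Stride::Index;
static_assert(sizeof(LdaIndex) >= sizeof(int),
              "stride index should be at least as wide as int");
```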
2 changes: 1 addition & 1 deletion examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
@@ -67,7 +67,7 @@ beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the computation.
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
64x64x4, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in a
bank-conflict-free manner, and a ton of other variables required to compose, initialize and launch a
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
8 changes: 4 additions & 4 deletions examples/10_planar_complex/planar_complex.cu
@@ -275,10 +275,10 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;

int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);

int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
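
A note on the `* 2` factors above, reading the example rather than stating its layout definitively: in a planar-complex format each batch stores a real plane followed by an imaginary plane, so the batch stride is twice the per-plane element count, and `imag_stride_X` locates the imaginary plane relative to the real one.

```cpp
#include <cstdint>

// Sketch: per-batch stride for a planar-complex matrix of shape rows x cols,
// with the real and imaginary planes stored back to back.
int64_t planar_complex_batch_stride(int rows, int cols) {
  int64_t plane = int64_t(rows) * cols;  // elements in one plane
  return plane * 2;                      // real plane + imaginary plane
}
```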
9 changes: 5 additions & 4 deletions examples/11_planar_complex_array/planar_complex_array.cu
@@ -292,10 +292,11 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;

int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);


int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
4 changes: 4 additions & 0 deletions examples/13_two_tensor_op_fusion/README.md
@@ -48,6 +48,10 @@ addition to its own input activation tile. Therefore the input activation warp t
2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
register file, and the operation can be fully register-file-resident.

When applying the above constraint to convolutions, the 2nd Convolution kernel must have no
halos, so that the data used by each threadblock does not depend on any other threadblock.
Typically this requires that the 2nd Convolution use a 1x1 filter without any padding.

# Copyright

Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.