diff --git a/CHANGELOG.md b/CHANGELOG.md index d90f71378b..bdee14f0fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,19 @@ # NVIDIA CUTLASS Changelog # CUTLASS 2.x + +## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26) + * Tensor reductions + * User-supplied reduction operations across one or more dimensions of tensors with affine layouts + * Optimizations for vectorized memory accesses + * Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31) + * Fused inlined operations on Convolution input + * Vector broadcast and transformation on Convolution input + * Optimizations for 3-D convolution + * Tile iterators using precomputed delta table for three spatial dimensions + * Performance parity with 2-D convolution implementation + + ## [2.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.4.0) (2020-11-19) * Implicit GEMM convolution kernels supporting CUDA and Tensor Cores on NVIDIA GPUs * Operators: forward (Fprop), backward data gradient (Dgrad), and backward weight gradient (Wgrad) convolution @@ -126,7 +139,7 @@ ## Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/CMakeLists.txt b/CMakeLists.txt index a0ece82c6d..4abf54a986 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -32,7 +32,7 @@ endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") -project(CUTLASS VERSION 2.4.0 LANGUAGES CXX) +project(CUTLASS VERSION 2.5.0 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) find_package(Doxygen QUIET) @@ -67,6 +67,8 @@ else() set(CUTLASS_ENABLE_TOOLS_INIT ON) endif() +set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.") + set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable CUTLASS Examples") set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools") set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_TOOLS} CACHE BOOL "Enable CUTLASS Library") @@ -114,10 +116,6 @@ if (POLICY CMP0076) cmake_policy(SET CMP0076 NEW) endif() -if( NOT CMAKE_SIZEOF_VOID_P EQUAL 8 ) - message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!") -endif() - include(GNUInstallDirs) link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs) @@ -257,6 +255,17 @@ if (NOT CMAKE_BUILD_TYPE MATCHES "Release") list(APPEND CUTLASS_CUDA_NVCC_FLAGS -lineinfo) endif() +#Report CUDA build flags +if (CUDA_COMPILER MATCHES "[Cc]lang") + if(CUTLASS_CUDA_CLANG_FLAGS) + message(STATUS "Using CLANG flags: ${CUTLASS_CUDA_CLANG_FLAGS}") + endif() +else() + if(CUTLASS_CUDA_NVCC_FLAGS) + message(STATUS "Using NVCC flags: ${CUTLASS_CUDA_NVCC_FLAGS}") + endif() +endif() + if(CUDA_COMPILER MATCHES "[Cc]lang") if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. 
Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" ) @@ -318,20 +327,35 @@ function(cutlass_apply_cuda_gencode_flags TARGET) endfunction() +# Cache the flags so they are available when the function below is called anywhere globally. + +set(__CUTLASS_CUDA_FLAGS ${CUTLASS_CUDA_FLAGS} CACHE INTERNAL "") +set(__CUTLASS_CUDA_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} CACHE INTERNAL "") +set(__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "") +set(__CUTLASS_CUDA_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} CACHE INTERNAL "") +set(__CUTLASS_CUDA_CLANG_FLAGS ${CUTLASS_CUDA_CLANG_FLAGS} CACHE INTERNAL "") +set(__CUTLASS_CUDA_CLANG_FLAGS_RELEASE ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE} CACHE INTERNAL "") +set(__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "") +set(__CUTLASS_CUDA_CLANG_FLAGS_DEBUG ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG} CACHE INTERNAL "") +set(__CUTLASS_CUDA_NVCC_FLAGS ${CUTLASS_CUDA_NVCC_FLAGS} CACHE INTERNAL "") +set(__CUTLASS_CUDA_NVCC_FLAGS_RELEASE ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE} CACHE INTERNAL "") +set(__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "") +set(__CUTLASS_CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG} CACHE INTERNAL "") + function(cutlass_apply_standard_compile_options TARGET) if(CUDA_COMPILER MATCHES "[Cc]lang") set(CUDA_COMPILE_LANGUAGE CXX) - set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_CLANG_FLAGS}) - set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE}) - set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO}) - set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG}) + set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_CLANG_FLAGS}) + set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_CLANG_FLAGS_RELEASE}) + set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO}) + set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_CLANG_FLAGS_DEBUG}) else() set(CUDA_COMPILE_LANGUAGE CUDA) - set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_NVCC_FLAGS}) - set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE}) - set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO}) - set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG}) + set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_NVCC_FLAGS}) + set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_NVCC_FLAGS_RELEASE}) + set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO}) + set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_NVCC_FLAGS_DEBUG}) endif() target_compile_options( @@ -464,20 +488,6 @@ endif() ################################################################################ -include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake) - -if (CUTLASS_ENABLE_CUBLAS) - target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1) -endif() - -include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake) - -if (CUTLASS_ENABLE_CUDNN) - target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1) -endif() - -################################################################################ - include(CTest) enable_testing() if (NOT TARGET test_all) @@ -497,6 +507,22 @@ install(DIRECTORY DESTINATION 
${CUTLASS_TEST_INSTALL_BINDIR}) install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_LIBDIR}) install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest) +################################################################################ + +include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake) + +if (CUTLASS_ENABLE_CUBLAS) + target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1) +endif() + +include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake) + +if (CUTLASS_ENABLE_CUDNN) + target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1) +endif() + +################################################################################ + set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.config.cmake) set(CUTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "") diff --git a/CUDA.cmake b/CUDA.cmake index c887178a89..3578989a23 100644 --- a/CUDA.cmake +++ b/CUDA.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -204,7 +204,7 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) # paths by default, so we add it explicitly here. function(cutlass_correct_source_file_language_property) - if(CUDA_COMPILER MATCHES "clang") + if(CUDA_COMPILER MATCHES "[Cc]lang") foreach(File ${ARGN}) if(File MATCHES ".*\.cu$") set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) diff --git a/README.md b/README.md index d8855c7395..d376d636ef 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 2.4 +# CUTLASS 2.5 -_CUTLASS 2.4 - November 2020_ +_CUTLASS 2.5 - February 2021_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -34,12 +34,18 @@ See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly. See the [functionality listing](/media/docs/functionality.md) for the list of operations supported at each level of the execution model hierarchy. +# What's New in CUTLASS 2.5 +CUTLASS 2.5 is a minor update to CUTLASS adding: +- Tensor reductions +- Fused inlined operations on Convolution input +- Optimizations for 3-D convolution +- See the [CHANGELOG](CHANGELOG.md) for more details + # What's New in CUTLASS 2.4 CUTLASS 2.4 is a significant update to CUTLASS adding: - 1-D, 2-D, and 3-D convolution targeting Tensor and CUDA cores for NVIDIA Ampere, Turing, and Volta GPU architectures - CUTLASS profiler support for convolution - [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation -- See the [CHANGELOG](CHANGELOG.md) for more details. # What's New in CUTLASS 2.3 @@ -47,7 +53,6 @@ CUTLASS 2.3 is a minor update to CUTLASS adding: - GEMMs targeting structured [Sparse Tensor Cores](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) in NVIDIA Ampere Architecture GPUs - Fast SGEMM kernels targeting GeForce RTX 30-series CUDA Cores - Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit) -- See the [CHANGELOG](CHANGELOG.md) for more details. 
# What's New in CUTLASS 2.2 @@ -508,7 +513,7 @@ The official list of CUTLASS developers and contributors is available here: [CON # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/cmake/nop.cu b/cmake/nop.cu index 518a582b89..77216e5c7b 100644 --- a/cmake/nop.cu +++ b/cmake/nop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/cuBLAS.cmake b/cuBLAS.cmake index 0ad6db2378..0e1733f0ac 100644 --- a/cuBLAS.cmake +++ b/cuBLAS.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/cuDNN.cmake b/cuDNN.cmake index da5e453131..0eb8e853ab 100644 --- a/cuDNN.cmake +++ b/cuDNN.cmake @@ -1,5 +1,5 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/00_basic_gemm/CMakeLists.txt b/examples/00_basic_gemm/CMakeLists.txt index 9ae257d9ab..8a619b3258 100644 --- a/examples/00_basic_gemm/CMakeLists.txt +++ b/examples/00_basic_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index bda012abee..1dbeef75d5 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/01_cutlass_utilities/CMakeLists.txt b/examples/01_cutlass_utilities/CMakeLists.txt index 5f22b7b1cf..9a1d59325c 100644 --- a/examples/01_cutlass_utilities/CMakeLists.txt +++ b/examples/01_cutlass_utilities/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/01_cutlass_utilities/cutlass_utilities.cu b/examples/01_cutlass_utilities/cutlass_utilities.cu index d1eaa57fe7..8d6bf6a61a 100644 --- a/examples/01_cutlass_utilities/cutlass_utilities.cu +++ b/examples/01_cutlass_utilities/cutlass_utilities.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/02_dump_reg_shmem/CMakeLists.txt b/examples/02_dump_reg_shmem/CMakeLists.txt index 5e6112e026..15216513aa 100644 --- a/examples/02_dump_reg_shmem/CMakeLists.txt +++ b/examples/02_dump_reg_shmem/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/02_dump_reg_shmem/dump_reg_shmem.cu b/examples/02_dump_reg_shmem/dump_reg_shmem.cu index 9d7db79a95..c4276da103 100644 --- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu +++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt index 27a87c9292..60700f5fcb 100644 --- a/examples/03_visualize_layout/CMakeLists.txt +++ b/examples/03_visualize_layout/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/03_visualize_layout/options.h b/examples/03_visualize_layout/options.h index dd7de198a4..4fba7a77bd 100644 --- a/examples/03_visualize_layout/options.h +++ b/examples/03_visualize_layout/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/register_layout.cu b/examples/03_visualize_layout/register_layout.cu index 0d2b25eb30..1a761ecb3b 100644 --- a/examples/03_visualize_layout/register_layout.cu +++ b/examples/03_visualize_layout/register_layout.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/register_layout.h b/examples/03_visualize_layout/register_layout.h index 1518e433c8..7eb1c778e5 100644 --- a/examples/03_visualize_layout/register_layout.h +++ b/examples/03_visualize_layout/register_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp index 3c4b783ca6..b8098d5038 100644 --- a/examples/03_visualize_layout/visualize_layout.cpp +++ b/examples/03_visualize_layout/visualize_layout.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h index 4093d27721..5d96acc4c4 100644 --- a/examples/03_visualize_layout/visualize_layout.h +++ b/examples/03_visualize_layout/visualize_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/04_tile_iterator/CMakeLists.txt b/examples/04_tile_iterator/CMakeLists.txt index cd32e2287a..7d22d9cf67 100644 --- a/examples/04_tile_iterator/CMakeLists.txt +++ b/examples/04_tile_iterator/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu index 5c56f33bd8..47aaad8f58 100644 --- a/examples/04_tile_iterator/tile_iterator.cu +++ b/examples/04_tile_iterator/tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/05_batched_gemm/CMakeLists.txt b/examples/05_batched_gemm/CMakeLists.txt index 6cd0ca8dba..f62f20955e 100644 --- a/examples/05_batched_gemm/CMakeLists.txt +++ b/examples/05_batched_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu index a9d8a9c680..10204837bb 100644 --- a/examples/05_batched_gemm/batched_gemm.cu +++ b/examples/05_batched_gemm/batched_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/06_splitK_gemm/CMakeLists.txt b/examples/06_splitK_gemm/CMakeLists.txt index 7b30ae1668..e47f8df876 100644 --- a/examples/06_splitK_gemm/CMakeLists.txt +++ b/examples/06_splitK_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu index b38de0c885..8aec0a294b 100644 --- a/examples/06_splitK_gemm/splitk_gemm.cu +++ b/examples/06_splitK_gemm/splitk_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/07_volta_tensorop_gemm/CMakeLists.txt b/examples/07_volta_tensorop_gemm/CMakeLists.txt index 82e8172271..61d5a82597 100644 --- a/examples/07_volta_tensorop_gemm/CMakeLists.txt +++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu index ac27fa177d..23d5a95e2e 100644 --- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu +++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -284,8 +284,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt index b4e4fe82f6..b5b16ba1de 100644 --- a/examples/08_turing_tensorop_gemm/CMakeLists.txt +++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index 36f794d921..ba739bea01 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -266,8 +266,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel diff --git a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt index b1b5c8df1e..d529f978ea 100644 --- a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt +++ b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index cf07efdcb5..efbca39d63 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -485,6 +485,7 @@ Result profile_convolution(Options const &options) { // Split K dimension into 1 partitions int split_k_slices = 1; + // Construct Conv2dProblemSize with user defined output size cutlass::conv::Conv2dProblemSize problem_size( options.input_size, options.filter_size, @@ -495,6 +496,8 @@ Result profile_convolution(Options const &options) { mode, split_k_slices); + // Construct ImplicitGemm::Argument structure with conv2d + // problem size, data pointers, and epilogue values typename ImplicitGemm::Arguments arguments{ problem_size, tensor_a.device_ref(), @@ -515,6 +518,9 @@ Result profile_convolution(Options const &options) { // Allocate workspace memory cutlass::device_memory::allocation workspace(workspace_size); + result.status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(result.status); + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(result.status); diff --git a/examples/10_planar_complex/CMakeLists.txt b/examples/10_planar_complex/CMakeLists.txt index 555836aebf..31e5c31a1d 100644 --- a/examples/10_planar_complex/CMakeLists.txt +++ b/examples/10_planar_complex/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu index d810777d9c..1ee8a069ee 100644 --- a/examples/10_planar_complex/planar_complex.cu +++ b/examples/10_planar_complex/planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/11_planar_complex_array/CMakeLists.txt b/examples/11_planar_complex_array/CMakeLists.txt index 2a3f5987e4..082629b87a 100644 --- a/examples/11_planar_complex_array/CMakeLists.txt +++ b/examples/11_planar_complex_array/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu index 53134168a0..e74ba10a18 100644 --- a/examples/11_planar_complex_array/planar_complex_array.cu +++ b/examples/11_planar_complex_array/planar_complex_array.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt index fb78d77fa2..1a02f5e6d2 100644 --- a/examples/12_gemm_bias_relu/CMakeLists.txt +++ b/examples/12_gemm_bias_relu/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu index 2b5c779bc6..5ad0d4a0ca 100644 --- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu +++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -48,11 +48,19 @@ using ElementInputA = cutlass::half_t; // <- data type of elements using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B using ElementOutput = float; // <- data type of elements in output matrix D -// The code section below describes matrix layout of input and output matrices. Column Major for -// Matrix A, Row Major for Matrix B and Row Major for Matrix C +// The code section below describes matrix layout of input and output matrices. +// Column Major for Matrix A, B and C. +// +// Note this example only works for ColumnMajor output because +// 1) we only have row major epilogue. +// 2) we swap A and B if the output is column major then we can still use the +// row major epilogue. +// 3) Mx1 bias vector becomes 1xM after the swapping/transposing. +// 4) we can use the existing OutputIterator to load 1xM bias vector. + using LayoutInputA = cutlass::layout::ColumnMajor; using LayoutInputB = cutlass::layout::ColumnMajor; -using LayoutOutput = cutlass::layout::RowMajor; +using LayoutOutput = cutlass::layout::ColumnMajor; // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM using MMAOp = cutlass::arch::OpClassTensorOp; @@ -73,17 +81,18 @@ using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSw // Define the epilogue operation as LinearCombinationRelu. 
This is approximately equal to // -// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * c_ij ) +// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + c_ij ) // using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu< - ElementOutput, // <- data type of output matrix - 128 / cutlass::sizeof_bits::value, // <- this is the number of elements per - // vectorized memory access. For half - // precision, it's 8 elements. This becomes - // the vector width of math instructions in - // epilogue too - ElementAccumulator, // <- data type of accumulator - ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- this is the number of elements per + // vectorized memory access. For half + // precision, it's 8 elements. This becomes + // the vector width of math instructions in + // epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue, // <- data type for alpha in linear combination function + cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // <- alpha x C + bias // Number of pipelines you want to use constexpr int NumStages = 2; @@ -160,9 +169,8 @@ int run() { tensor_d.sync_device(); tensor_ref_d.sync_device(); - // Initialize alpha and beta for dot product computation + // Initialize alpha for dot product computation ElementComputeEpilogue alpha = ElementComputeEpilogue(1); - ElementComputeEpilogue beta = ElementComputeEpilogue(0); // Split K dimension into 1 partitions int split_k_slices = 1; @@ -178,7 +186,7 @@ int run() { // to project away the N dimension by setting the stride to zero. tensor_d.device_ref(), // <- reference to matrix D on device - {alpha, beta}, // <- tuple of alpha and beta + {alpha}, // <- alpha split_k_slices}; // <- k-dimension split factor // Using the arguments, query for extra workspace required for matrix multiplication computation @@ -190,8 +198,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel @@ -233,7 +245,7 @@ int run() { for (int j = 0; j < problem_size.n(); ++j) { tensor_ref_d.at({i, j}) = std::max( ElementOutput(0), - ElementOutput(tensor_ref_d.at({i, j}) + beta * tensor_c_bias.at({i, 0})) + ElementOutput(tensor_ref_d.at({i, j}) + tensor_c_bias.at({i, 0})) ); } } diff --git a/examples/13_two_tensor_op_fusion/CMakeLists.txt b/examples/13_two_tensor_op_fusion/CMakeLists.txt new file mode 100644 index 0000000000..220485b7b7 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/CMakeLists.txt @@ -0,0 +1,45 @@ +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. 
+# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 13_fused_two_gemms + fused_gemm.cu + ) + +cutlass_example_add_executable( + 13_fused_two_convs + fused_conv2d.cu + ) + + +target_include_directories( + 13_fused_two_gemms + PRIVATE + . + ) + +target_include_directories( + 13_fused_two_convs + PRIVATE + . + ) + diff --git a/examples/13_two_tensor_op_fusion/README.md b/examples/13_two_tensor_op_fusion/README.md new file mode 100644 index 0000000000..d89d876a0c --- /dev/null +++ b/examples/13_two_tensor_op_fusion/README.md @@ -0,0 +1,76 @@ +# Introduction + +This example shows fusing two back-to-back GEMMs/Convolutions into one kernel. + +
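+As a purely illustrative sketch (not part of this example's sources), the host-side
+reference below spells out what the two back-to-back operations compute in the GEMM case.
+The function and variable names here are ours; the general GEMM/Conv form is given by the
+equations that follow.
+
+```cpp
+#include <algorithm>
+#include <vector>
+
+using Matrix = std::vector<std::vector<float>>;
+
+// D0 = relu(alpha0 * A0 * B0); D1 = relu(alpha1 * D0 * B1 + beta1 * C1).
+// In the fused kernel, D0 never leaves the register file.
+Matrix reference_b2b_gemm(Matrix const &A0, Matrix const &B0, Matrix const &B1,
+                          Matrix const &C1, float alpha0, float alpha1, float beta1) {
+  int M = int(A0.size()), K0 = int(B0.size()), N0 = int(B1.size()), N1 = int(B1[0].size());
+  Matrix D0(M, std::vector<float>(N0, 0.f)), D1(M, std::vector<float>(N1, 0.f));
+  for (int i = 0; i < M; ++i)        // 1st GEMM + ReLU
+    for (int j = 0; j < N0; ++j) {
+      float acc = 0.f;
+      for (int k = 0; k < K0; ++k) acc += A0[i][k] * B0[k][j];
+      D0[i][j] = std::max(0.f, alpha0 * acc);
+    }
+  for (int i = 0; i < M; ++i)        // 2nd GEMM + residual + ReLU
+    for (int j = 0; j < N1; ++j) {
+      float acc = 0.f;
+      for (int k = 0; k < N0; ++k) acc += D0[i][k] * B1[k][j];
+      D1[i][j] = std::max(0.f, alpha1 * acc + beta1 * C1[i][j]);
+    }
+  return D1;
+}
+```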

+ +When running two unfused GEMM/Conv operations, each operation loads one input +activation matrix and one weight matrix (or filter matrix) from memory and then +stores the resulting activation matrix back to memory. + +When the two GEMM/Conv operations are fused together, the mainloops of the two +GEMMs/Convs run back to back in a single kernel. The output accumulator of the +1st GEMM/Conv will be stored in the register file and reused as the activation +input of the 2nd GEMM/Conv. This saves a round trip to memory for the activation +matrix. + + +This example computes the following: +- 1st GEMM/Conv: D0 = relu(alpha0 .\* A0 \*\* B0) +- 2nd GEMM/Conv: D1 = relu(alpha1 .\* D0 \*\* B1 + beta1 .\* C1) + +In the above equations, the operator \*\* can be either a matrix multiplication or a convolution. + +# Implementation Details + +In order to run two GEMM/Convs in a single kernel, the example requires that the same number of +threadblocks be used across the two GEMMs/Convs. This also ensures the same threadblock tile M across +the two GEMMs/Convs. + +In order to reuse the output accumulator (stored in the register file) of the 1st GEMM as the +input activation, the example enforces the following two constraints: + +- thread_block_tile_N = problem_N +

+ +This constraint ensures that each threadblock loads the entire weight/filter matrix in +addition to its own input activation tile. Therefore the input activation tile of the +2nd GEMM/Conv only depends on the output activation tile of the 1st GEMM/Conv, and the +operation can be fully block-resident. + +- warp_tile_N = thread_block_tile_N + +

+ +This constraint ensures that each warp loads the entire weight/filter kBlock in +addition to its own input activation tile. Therefore the input activation warp tile of the +2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the +register file, and the operation can be fully register-file-resident. + +# Copyright + +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + +``` + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h new file mode 100644 index 0000000000..305d18297c --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h @@ -0,0 +1,368 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "device/b2b_implicit_gemm_convolution.h" +#include "b2b_conv2d_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); +cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); + +void run_nonfused_conv2d_fprop_f16_sm75() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + 
cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back FP16 Analytic Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_f16_sm75() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back FP16 Analytic Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_nonfused_conv2d_fprop_optimized_f16_sm75() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 
32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_optimized_f16_sm75() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h new file mode 100644 index 0000000000..e14134e944 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h @@ -0,0 +1,363 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "device/b2b_implicit_gemm_convolution.h" +#include "b2b_conv2d_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); +cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); + +void run_nonfused_conv2d_fprop_f16_sm80() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back FP16 Analytic Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, 
beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_f16_sm80() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back FP16 Analytic Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_nonfused_conv2d_fprop_optimized_f16_sm80() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + 
cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_optimized_f16_sm80() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git 
a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h new file mode 100644 index 0000000000..2cb4ac2e80 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h @@ -0,0 +1,367 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "device/b2b_implicit_gemm_convolution.h" +#include "b2b_interleaved_conv2d_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); +cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); + +void run_nonfused_conv2d_fprop_s8_sm75() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bInterleavedNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, 
cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_s8_sm75() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bInterleavedFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_nonfused_conv2d_fprop_optimized_s8_sm75() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 
cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bInterleavedNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_optimized_s8_sm75() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bInterleavedFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if 
defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h new file mode 100644 index 0000000000..c73d6c69b4 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h @@ -0,0 +1,368 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "device/b2b_implicit_gemm_convolution.h" +#include "b2b_interleaved_conv2d_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); +cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); + +void run_nonfused_conv2d_fprop_s8_sm80() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bInterleavedNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, 
cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_s8_sm80() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 8 * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bInterleavedFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_nonfused_conv2d_fprop_optimized_s8_sm80() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + 
cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bInterleavedNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_optimized_s8_sm80() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 8 * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bInterleavedFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if 
defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h new file mode 100644 index 0000000000..07e3a0dfc2 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h @@ -0,0 +1,628 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/reference/device/tensor_relu.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + + +template +class B2bNonFusedConv2dRun { +public: + + using Conv2d0 = Conv2d0_; + using Conv2d1 = Conv2d1_; + using ElementAccumulator = typename Conv2d0::ElementAccumulator; + using ElementCompute = typename Conv2d0::ElementCompute; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator; + static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator, + "Fused convolution operators must be the same"); + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_D0_computed; + cutlass::HostTensor tensor_D0_reference; + + cutlass::HostTensor tensor_B1; + cutlass::HostTensor tensor_C1; + cutlass::HostTensor tensor_D1_computed; + cutlass::HostTensor tensor_D1_reference; + + +public: + + B2bNonFusedConv2dRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 16) { + scope = 2; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) { + + tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0)); + 
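// The implicit_gemm_tensor_{a,b,c}_extent() helpers size each operand from the
+    // convolution problem: for Fprop, A maps to the (N, H, W, C) activations, B to
+    // the (K, R, S, C) filters, and C/D to the (N, P, Q, K) output. No separate A1
+    // tensor is allocated because the first convolution's output D0 is fed directly
+    // in as the activation input of the second convolution.
+    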
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + + initialize_tensor(tensor_A0.host_view(), init_A, seed); + initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C0.host_view(), init_C, seed * 39); + initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); + initialize_tensor(tensor_C1.host_view(), init_C, seed * 40); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_D0_computed.sync_device(); + tensor_D0_reference.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_D1_computed.sync_device(); + tensor_D1_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + initialize(problem_size_0, problem_size_1); + + // configure the operator + Conv2d0 conv2d_op_0; + Conv2d1 conv2d_op_1; + + typename Conv2d0::Arguments conv2d_args_0( + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_computed.device_ref(), + {alpha0, beta0}, + split_k_mode + ); + typename Conv2d1::Arguments conv2d_args_1( + problem_size_1, + tensor_D0_computed.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_computed.device_ref(), + {alpha1, beta1}, + split_k_mode + ); + + + cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0); + + CUTLASS_CHECK(status); + + status = conv2d_op_1.initialize(conv2d_args_1); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = conv2d_op_0(); + CUTLASS_CHECK(status); + status = conv2d_op_1(); + CUTLASS_CHECK(status); + } + + // + // Run Conv2d + // + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + + for(int i = 0; i < runs; i++) { + // run conv2d operator + status = conv2d_op_0(); + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + + for(int i = 0; i < runs; i++) { + // run conv2d operator + status = conv2d_op_1(); + CUTLASS_CHECK(status); + } + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float conv2d0Time, conv2d1Time, totalTime; + cudaEventElapsedTime(&conv2d0Time, start, stop1); + cudaEventElapsedTime(&conv2d1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n"; + std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " 
ms\n"; + std::cout << "total time " << totalTime / (float)runs << " ms\n"; + + tensor_D0_computed.sync_host(); + tensor_D1_computed.sync_host(); + + bool passed = false; + + cutlass::reference::device::Conv2d< + typename Conv2d0::ElementA, + typename Conv2d0::LayoutA, + typename Conv2d0::ElementB, + typename Conv2d0::LayoutB, + typename Conv2d0::ElementC, + typename Conv2d0::LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_reference.device_ref(), + alpha0, + beta0); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); + } + + cutlass::reference::device::Conv2d< + typename Conv2d1::ElementA, + typename Conv2d1::LayoutA, + typename Conv2d1::ElementB, + typename Conv2d1::LayoutB, + typename Conv2d1::ElementC, + typename Conv2d1::LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size_1, + tensor_D0_reference.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_reference.device_ref(), + alpha1, + beta1); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); + } + + cudaError_t result = cudaDeviceSynchronize(); + CHECK_TRUE(result == cudaSuccess); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D0_reference.sync_host(); + tensor_D1_reference.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0); + + passed = cutlass::reference::host::TensorEquals( + tensor_D1_computed.host_view(), + tensor_D1_reference.host_view()); + + CHECK_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_B2bImplicitGemm_device_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream results(fname.str()); + + results << problem_size_0 << std::endl; + results << problem_size_1 << std::endl; + + results + << "\nA0:\n" << tensor_A0.host_view() << "\n" + << "\nB0:\n" << tensor_B0.host_view() << "\n" + << "\nC0:\n" << tensor_C0.host_view() << "\n" + << "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n" + << "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n" + << "\nB1:\n" << tensor_B1.host_view() << "\n" + << "\nC1:\n" << tensor_C1.host_view() << "\n" + << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n" + << "\nD1 computed:\n" << tensor_D1_computed.host_view(); + + + } + + return passed; + } + +}; + +template +class B2bFusedConv2dRun { +public: + + using B2bConv2d = B2bConv2d_; + using ElementAccumulator = typename B2bConv2d::ElementAccumulator; + using ElementCompute = typename B2bConv2d::ElementCompute; + + static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator; + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_D0_reference; + + cutlass::HostTensor tensor_B1; + cutlass::HostTensor tensor_C1; + cutlass::HostTensor 
tensor_D1_computed; + cutlass::HostTensor tensor_D1_reference; + + +public: + + B2bFusedConv2dRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 16) { + scope = 2; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) { + + tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0)); + tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + + initialize_tensor(tensor_A0.host_view(), init_A, seed); + initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C0.host_view(), init_C, seed * 39); + initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); + initialize_tensor(tensor_C1.host_view(), init_C, seed * 40); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_D0_reference.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_D1_computed.sync_device(); + tensor_D1_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + initialize(problem_size_0, problem_size_1); + + // configure the operator + B2bConv2d b2b_conv2d_op; + + typename B2bConv2d::Arguments b2b_conv2d_args( + problem_size_0, + problem_size_1, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_B1.device_ref(), + 
tensor_C1.device_ref(), + tensor_D1_computed.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + split_k_mode + ); + + cutlass::Status status = b2b_conv2d_op.initialize(b2b_conv2d_args); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = b2b_conv2d_op(); + CUTLASS_CHECK(status); + } + + // + // Run the Conv2d + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < runs; i++) { + + // run conv2d operator + status = b2b_conv2d_op(); + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float conv2dTime; + cudaEventElapsedTime(&conv2dTime, start, stop); + std::cout << "time " << conv2dTime / (float)runs << " ms\n"; + + tensor_D1_computed.sync_host(); + + bool passed = false; + + cutlass::reference::device::Conv2d< + typename B2bConv2d::ElementA, + typename B2bConv2d::LayoutA, + typename B2bConv2d::ElementB, + typename B2bConv2d::LayoutB, + typename B2bConv2d::ElementC, + typename B2bConv2d::LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_reference.device_ref(), + alpha0, + beta0); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); + } + + cutlass::reference::device::Conv2d< + typename B2bConv2d::ElementA, + typename B2bConv2d::LayoutA, + typename B2bConv2d::ElementB, + typename B2bConv2d::LayoutB, + typename B2bConv2d::ElementC, + typename B2bConv2d::LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size_1, + tensor_D0_reference.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_reference.device_ref(), + alpha1, + beta1); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); + } + + cudaError_t result = cudaDeviceSynchronize(); + CHECK_TRUE(result == cudaSuccess); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D0_reference.sync_host(); + tensor_D1_reference.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0); + + passed = cutlass::reference::host::TensorEquals( + tensor_D1_computed.host_view(), + tensor_D1_reference.host_view()); + + CHECK_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_B2bImplicitGemm_device_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream results(fname.str()); + + results << problem_size_0 << std::endl; + results << problem_size_1 << std::endl; + + results + << "\nA0:\n" << tensor_A0.host_view() << "\n" + << "\nB0:\n" << tensor_B0.host_view() << "\n" + << "\nC0:\n" << tensor_C0.host_view() << "\n" + << "\nB1:\n" << tensor_B1.host_view() << "\n" + << "\nC1:\n" << tensor_C1.host_view() << "\n" + << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n" + << "\nD1 computed:\n" << tensor_D1_computed.host_view(); + + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h 
similarity index 92% rename from examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h rename to examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h index 10a0d4bf94..50da709e73 100644 --- a/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -43,14 +43,15 @@ //////////////////////////////////////////////////////////////////////////////// +cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*1600, 64, 576); +cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*1600, 128, 64); + void run_nonfused_gemm_f16() { using ElementOutput = cutlass::half_t; using ElementAccumulator = cutlass::half_t; using ElementCompute = cutlass::half_t; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -110,7 +111,7 @@ void run_nonfused_gemm_f16() { B2bNonFusedGemmRun nonFusedGemm; std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n"; - bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool pass = nonFusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1); if(pass) std::cout << "Pass\n"; else @@ -123,8 +124,6 @@ void run_fused_gemm_f16() { using ElementAccumulator = cutlass::half_t; using ElementCompute = cutlass::half_t; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -178,7 +177,7 @@ void run_fused_gemm_f16() { B2bFusedGemmRun fusedGemm; std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n"; - bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool passed = fusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1); if(passed) std::cout << "Pass\n"; else diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h new file mode 100644 index 0000000000..749ece2b22 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h @@ -0,0 +1,189 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "device/b2b_gemm.h" +#include "b2b_gemm_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*1600, 64, 576); +cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*1600, 128, 64); + +void run_nonfused_gemm_f16_sm80() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using Gemm0 = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3 + >; + using Gemm1 = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + 
WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3 + >; + + B2bNonFusedGemmRun nonFusedGemm; + + std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n"; + bool pass = nonFusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; +} + +void run_fused_gemm_f16_sm80() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bGemm = cutlass::gemm::device::B2bGemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3 + >; + + B2bFusedGemmRun fusedGemm; + + std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n"; + bool passed = fusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} +//////////////////////////////////////////////////////////////////////////////// + +#endif //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/examples/13_fused_two_gemms/b2b_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h similarity index 95% rename from examples/13_fused_two_gemms/b2b_gemm_run.h rename to examples/13_two_tensor_op_fusion/b2b_gemm_run.h index 053064d751..8143f3d21a 100644 --- a/examples/13_fused_two_gemms/b2b_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -121,7 +121,9 @@ struct B2bNonFusedGemmRun ElementCompute beta0 = ElementCompute(0), ElementCompute alpha1 = ElementCompute(1), ElementCompute beta1 = ElementCompute(0), - bool relu = true) { + bool relu = true, + int warm_ups = 1, + int runs = 100) { // // Allocate the GEMM workspace @@ -222,6 +224,14 @@ struct B2bNonFusedGemmRun status = gemm_op_1.initialize(arguments_1); CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = gemm_op_0(); + CUTLASS_CHECK(status); + status = gemm_op_1(); + CUTLASS_CHECK(status); + } + // // Run the GEMM // @@ -233,13 +243,13 @@ struct B2bNonFusedGemmRun cudaEventRecord(start); - for(int i = 0; i < 100; i++) { + for(int i = 0; i < runs; i++) { status = gemm_op_0(); CUTLASS_CHECK(status); } cudaEventRecord(stop1); - for(int i = 0; i < 100; i++) { + for(int i = 0; i < runs; i++) { status = gemm_op_1(); @@ -252,9 +262,9 @@ struct B2bNonFusedGemmRun cudaEventElapsedTime(&gemm0Time, start, stop1); cudaEventElapsedTime(&gemm1Time, stop1, stop2); cudaEventElapsedTime(&totalTime, start, stop2); - std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n"; - std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n"; - std::cout << "total time " << totalTime / 100.0 << " ms\n"; + std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n"; + std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n"; + std::cout << "total time " << totalTime / (float)runs << " ms\n"; tensor_D0.sync_host(); tensor_D1.sync_host(); @@ -415,7 +425,9 @@ struct B2bFusedGemmRun ElementCompute beta0 = ElementCompute(0), ElementCompute alpha1 = ElementCompute(1), ElementCompute beta1 = ElementCompute(0), - bool relu = true) { + bool relu = true, + int warm_ups = 1, + int runs = 100) { // // Allocate the GEMM workspace @@ -433,10 +445,6 @@ struct B2bFusedGemmRun typename B2bGemm::ElementC, typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); -// cutlass::HostTensor< -// typename B2bGemm::ElementC, -// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn()); - cutlass::HostTensor< typename B2bGemm::ElementC, typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); @@ -503,6 +511,11 @@ struct B2bFusedGemmRun CUTLASS_CHECK(status); + for(int i = 0; i < warm_ups; i++) { + status = b2b_gemm_op(); + CUTLASS_CHECK(status); + } + // // Run the GEMM // @@ -513,7 +526,7 @@ struct B2bFusedGemmRun cudaEventRecord(start); - for(int i = 0; i < 100; i++) { + for(int i = 0; i < runs; i++) { status = b2b_gemm_op(); CUTLASS_CHECK(status); @@ -523,9 +536,8 @@ struct B2bFusedGemmRun cudaDeviceSynchronize(); float gemmTime; cudaEventElapsedTime(&gemmTime, start, stop); - std::cout << "time " << gemmTime / 100.0 << " ms\n"; + std::cout << "time " << gemmTime / (float)runs << " ms\n"; - //tensor_D0.sync_host(); tensor_D1.sync_host(); // @@ -593,7 +605,6 @@ struct B2bFusedGemmRun << "A0 =\n" << tensor_A0.host_view() << "\nB0 =\n" << tensor_B0.host_view() << "\nC0 =\n" << tensor_C0.host_view() -// << "\nD0 =\n" << tensor_D0.host_view() << "\nB1 =\n" << tensor_B1.host_view() << "\nC1 =\n" << tensor_C1.host_view() << "\n\nReference =\n" << reference_D1.host_view() diff --git a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h similarity index 92% rename from examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h 
rename to examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h index 1c3f15c2cf..2c2610b7d4 100644 --- a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -43,14 +43,15 @@ //////////////////////////////////////////////////////////////////////////////// +cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*1600, 64, 576); +cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*1600, 128, 64); + void run_nonfused_gemm_s8() { using ElementOutput = int8_t; using ElementAccumulator = int32_t; using ElementCompute = float; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -110,7 +111,7 @@ void run_nonfused_gemm_s8() { B2bInterleavedNonFusedGemmRun nonFusedGemm; std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n"; - bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool pass = nonFusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1); if(pass) std::cout << "Pass\n"; else @@ -123,8 +124,6 @@ void run_fused_gemm_s8() { using ElementAccumulator = int32_t; using ElementCompute = float; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -178,7 +177,7 @@ void run_fused_gemm_s8() { B2bInterleavedFusedGemmRun fusedGemm; std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n"; - bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool passed = fusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1); if(passed) std::cout << "Pass\n"; else diff --git a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h similarity index 91% rename from examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h rename to examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h index 32b77128e8..8b9eefc604 100644 --- a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -43,14 +43,15 @@ //////////////////////////////////////////////////////////////////////////////// +cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*1600, 64, 576); +cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*1600, 128, 64); + void run_nonfused_gemm_s8_sm80() { using ElementOutput = int8_t; using ElementAccumulator = int32_t; using ElementCompute = float; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -86,8 +87,7 @@ void run_nonfused_gemm_s8_sm80() { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; using Gemm1 = cutlass::gemm::device::Gemm< int8_t, @@ -113,14 +113,13 @@ void run_nonfused_gemm_s8_sm80() { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; B2bInterleavedNonFusedGemmRun nonFusedGemm; std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n"; - bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool pass = nonFusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); if(pass) std::cout << "Pass\n"; else @@ -133,8 +132,6 @@ void run_fused_gemm_s8_sm80() { using ElementAccumulator = int32_t; using ElementCompute = float; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -193,7 +190,7 @@ void run_fused_gemm_s8_sm80() { B2bInterleavedFusedGemmRun fusedGemm; std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n"; - bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); if(passed) std::cout << "Pass\n"; else diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h new file mode 100644 index 0000000000..1b0795fa41 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h @@ -0,0 +1,661 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/host_reorder.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/reference/device/tensor_relu.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + + +template +class B2bInterleavedNonFusedConv2dRun { +public: + + using Conv2d0 = Conv2d0_; + using Conv2d1 = Conv2d1_; + using ElementAccumulator = typename Conv2d0::ElementAccumulator; + using ElementCompute = typename Conv2d0::ElementCompute; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator; + static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator, + "Fused convolution operators must be the same"); + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_B0_reordered; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_D0_computed; + cutlass::HostTensor tensor_D0_reference; + + cutlass::HostTensor tensor_B1; + cutlass::HostTensor tensor_B1_reordered; + cutlass::HostTensor tensor_C1; + cutlass::HostTensor tensor_D1_computed; + cutlass::HostTensor tensor_D1_reference; + + +public: + + B2bInterleavedNonFusedConv2dRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + 
cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 16) { + scope = 2; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) { + + tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0)); + tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + + initialize_tensor(tensor_A0.host_view(), init_A, seed); + initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C0.host_view(), init_C, seed * 39); + initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); + initialize_tensor(tensor_C1.host_view(), init_C, seed * 40); + + //Reorder B0 and B1 + cutlass::reorder_convK( + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0)); + cutlass::reorder_convK( + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1)); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_B0_reordered.sync_device(); + tensor_C0.sync_device(); + tensor_D0_computed.sync_device(); + tensor_D0_reference.sync_device(); + tensor_B1.sync_device(); + tensor_B1_reordered.sync_device(); + tensor_C1.sync_device(); + tensor_D1_computed.sync_device(); + tensor_D1_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + initialize(problem_size_0, problem_size_1); + + // configure the operator + Conv2d0 conv2d_op_0; + 
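    // Note: in this non-fused baseline the two convolutions are launched as separate
    // kernels; conv2d_op_1 (declared next) reads tensor_D0_computed -- the first
    // convolution's output written back to global memory -- as its activation input.
    // The fused B2bConv2d runner later in this file passes both problem sizes to a
    // single operator so that this intermediate stays threadblock-resident instead.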
Conv2d1 conv2d_op_1; + + typename Conv2d0::Arguments conv2d_args_0( + problem_size_0, + tensor_A0.device_ref(), + tensor_B0_reordered.device_ref(), + tensor_C0.device_ref(), + tensor_D0_computed.device_ref(), + {alpha0, beta0}, + split_k_mode + ); + typename Conv2d1::Arguments conv2d_args_1( + problem_size_1, + tensor_D0_computed.device_ref(), + tensor_B1_reordered.device_ref(), + tensor_C1.device_ref(), + tensor_D1_computed.device_ref(), + {alpha1, beta1}, + split_k_mode + ); + + + cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0); + + CUTLASS_CHECK(status); + + status = conv2d_op_1.initialize(conv2d_args_1); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = conv2d_op_0(); + CUTLASS_CHECK(status); + status = conv2d_op_1(); + CUTLASS_CHECK(status); + } + + // + // Run Conv2d + // + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + + for(int i = 0; i < runs; i++) { + // run conv2d operator + status = conv2d_op_0(); + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + + for(int i = 0; i < runs; i++) { + // run conv2d operator + status = conv2d_op_1(); + CUTLASS_CHECK(status); + } + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float conv2d0Time, conv2d1Time, totalTime; + cudaEventElapsedTime(&conv2d0Time, start, stop1); + cudaEventElapsedTime(&conv2d1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n"; + std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n"; + std::cout << "total time " << totalTime / (float)runs << " ms\n"; + + tensor_D0_computed.sync_host(); + tensor_D1_computed.sync_host(); + + bool passed = false; + + cutlass::reference::device::Conv2d< + typename Conv2d0::ElementA, + typename Conv2d0::LayoutA, + typename Conv2d0::ElementB, + typename Conv2d0::LayoutB, + typename Conv2d0::ElementC, + typename Conv2d0::LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_reference.device_ref(), + alpha0, + beta0); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); + } + + cutlass::reference::device::Conv2d< + typename Conv2d1::ElementA, + typename Conv2d1::LayoutA, + typename Conv2d1::ElementB, + typename Conv2d1::LayoutB, + typename Conv2d1::ElementC, + typename Conv2d1::LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size_1, + tensor_D0_reference.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_reference.device_ref(), + alpha1, + beta1); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); + } + + cudaError_t result = cudaDeviceSynchronize(); + CHECK_TRUE(result == cudaSuccess); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D0_reference.sync_host(); + tensor_D1_reference.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0); + + passed = 
cutlass::reference::host::TensorEquals( + tensor_D1_computed.host_view(), + tensor_D1_reference.host_view()); + + CHECK_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_B2bImplicitGemm_device_interleaved_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream results(fname.str()); + + results << problem_size_0 << std::endl; + results << problem_size_1 << std::endl; + + results + << "\nA0:\n" << tensor_A0.host_view() << "\n" + << "\nB0:\n" << tensor_B0.host_view() << "\n" + << "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n" + << "\nC0:\n" << tensor_C0.host_view() << "\n" + << "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n" + << "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n" + << "\nB1:\n" << tensor_B1.host_view() << "\n" + << "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n" + << "\nC1:\n" << tensor_C1.host_view() << "\n" + << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n" + << "\nD1 computed:\n" << tensor_D1_computed.host_view(); + + + } + + return passed; + } + +}; + +template +class B2bInterleavedFusedConv2dRun { +public: + + using B2bConv2d = B2bConv2d_; + using ElementAccumulator = typename B2bConv2d::ElementAccumulator; + using ElementCompute = typename B2bConv2d::ElementCompute; + + static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator; + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_B0_reordered; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_D0_reference; + + cutlass::HostTensor tensor_B1; + cutlass::HostTensor tensor_B1_reordered; + cutlass::HostTensor tensor_C1; + cutlass::HostTensor tensor_D1_computed; + cutlass::HostTensor tensor_D1_reference; + + +public: + + B2bInterleavedFusedConv2dRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 16) { + scope = 2; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) { + + tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0)); + tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, 
problem_size_0)); + tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + + initialize_tensor(tensor_A0.host_view(), init_A, seed); + initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C0.host_view(), init_C, seed * 39); + initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); + initialize_tensor(tensor_C1.host_view(), init_C, seed * 40); + + //Reorder B0 and B1 + cutlass::reorder_convK<16, InterleavedK>( + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0)); + cutlass::reorder_convK( + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1)); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_B0_reordered.sync_device(); + tensor_C0.sync_device(); + tensor_D0_reference.sync_device(); + tensor_B1.sync_device(); + tensor_B1_reordered.sync_device(); + tensor_C1.sync_device(); + tensor_D1_computed.sync_device(); + tensor_D1_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + initialize(problem_size_0, problem_size_1); + + // configure the operator + B2bConv2d b2b_conv2d_op; + + typename B2bConv2d::Arguments b2b_conv2d_args( + problem_size_0, + problem_size_1, + tensor_A0.device_ref(), + tensor_B0_reordered.device_ref(), + tensor_C0.device_ref(), + tensor_B1_reordered.device_ref(), + tensor_C1.device_ref(), + tensor_D1_computed.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + split_k_mode + ); + + cutlass::Status status = b2b_conv2d_op.initialize(b2b_conv2d_args); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = b2b_conv2d_op(); + CUTLASS_CHECK(status); + } + + // + // Run the Conv2d + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < runs; i++) { + + // run conv2d operator + status = b2b_conv2d_op(); + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float conv2dTime; + cudaEventElapsedTime(&conv2dTime, start, stop); + std::cout << "time " << conv2dTime / (float)runs << " ms\n"; + + tensor_D1_computed.sync_host(); + + bool passed = false; + + cutlass::reference::device::Conv2d< + typename B2bConv2d::ElementA, + typename B2bConv2d::LayoutA, + typename B2bConv2d::ElementB, + typename 
B2bConv2d::LayoutB, + typename B2bConv2d::ElementC, + typename B2bConv2d::LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_reference.device_ref(), + alpha0, + beta0); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); + } + + cutlass::reference::device::Conv2d< + typename B2bConv2d::ElementA, + typename B2bConv2d::LayoutA, + typename B2bConv2d::ElementB, + typename B2bConv2d::LayoutB, + typename B2bConv2d::ElementC, + typename B2bConv2d::LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size_1, + tensor_D0_reference.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_reference.device_ref(), + alpha1, + beta1); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); + } + + cudaError_t result = cudaDeviceSynchronize(); + CHECK_TRUE(result == cudaSuccess); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D0_reference.sync_host(); + tensor_D1_reference.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0); + + passed = cutlass::reference::host::TensorEquals( + tensor_D1_computed.host_view(), + tensor_D1_reference.host_view()); + + CHECK_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_B2bImplicitGemm_device_interleaved_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream results(fname.str()); + + results << problem_size_0 << std::endl; + results << problem_size_1 << std::endl; + + results + << "\nA0:\n" << tensor_A0.host_view() << "\n" + << "\nB0:\n" << tensor_B0.host_view() << "\n" + << "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n" + << "\nC0:\n" << tensor_C0.host_view() << "\n" + << "\nB1:\n" << tensor_B1.host_view() << "\n" + << "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n" + << "\nC1:\n" << tensor_C1.host_view() << "\n" + << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n" + << "\nD1 computed:\n" << tensor_D1_computed.host_view(); + + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h similarity index 98% rename from examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h rename to examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h index e98be9e511..c33494095d 100644 --- a/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -243,6 +243,7 @@ struct B2bInterleavedNonFusedGemmRun status = gemm_op_1(); CUTLASS_CHECK(status); } + // // Run the GEMM // @@ -455,10 +456,6 @@ struct B2bInterleavedFusedGemmRun typename B2bGemm::ElementC, typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); -// cutlass::HostTensor< -// typename B2bGemm::ElementC, -// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn()); - cutlass::HostTensor< typename B2bGemm::ElementC, typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); @@ -507,7 +504,6 @@ struct B2bInterleavedFusedGemmRun tensor_B0.sync_device(); tensor_B0_reordered.sync_device(); tensor_C0.sync_device(); - //tensor_D0.sync_device(); tensor_B1.sync_device(); tensor_B1_reordered.sync_device(); tensor_C1.sync_device(); @@ -566,7 +562,6 @@ struct B2bInterleavedFusedGemmRun cudaEventElapsedTime(&gemmTime, start, stop); std::cout << "time " << gemmTime / (float)runs << " ms\n"; - //tensor_D0.sync_host(); tensor_D1.sync_host(); // @@ -635,7 +630,6 @@ struct B2bInterleavedFusedGemmRun << "\nB0 =\n" << tensor_B0.host_view() << "\nB0_reordered =\n" << tensor_B0_reordered.host_view() << "\nC0 =\n" << tensor_C0.host_view() -// << "\nD0 =\n" << tensor_D0.host_view() << "\nB1 =\n" << tensor_B1.host_view() << "\nB1_reordered =\n" << tensor_B1_reordered.host_view() << "\nC1 =\n" << tensor_C1.host_view() diff --git a/examples/13_fused_two_gemms/device/b2b_gemm.h b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h similarity index 99% rename from examples/13_fused_two_gemms/device/b2b_gemm.h rename to examples/13_two_tensor_op_fusion/device/b2b_gemm.h index 3f161435dd..b72ac2918f 100644 --- a/examples/13_fused_two_gemms/device/b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h new file mode 100644 index 0000000000..64f97b7b39 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h @@ -0,0 +1,274 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Template for device-level Implicit GEMM +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/conv/convolution.h" + +#include "kernel/b2b_implicit_gemm_convolution.h" +#include "kernel/default_b2b_conv2d_fprop.h" + +namespace cutlass { +namespace conv { +namespace device { + +template +class B2bImplicitGemmConvolution { +public: + + using B2bImplicitGemmKernel = B2bImplicitGemmKernel_; + + using ElementA = typename B2bImplicitGemmKernel::ElementA; + using LayoutA = typename B2bImplicitGemmKernel::LayoutA; + using ElementB = typename B2bImplicitGemmKernel::ElementB; + using LayoutB = typename B2bImplicitGemmKernel::LayoutB; + using ElementC = typename B2bImplicitGemmKernel::ElementC; + using LayoutC = typename B2bImplicitGemmKernel::LayoutC; + using ElementAccumulator = typename B2bImplicitGemmKernel::ElementAccumulator; + using ElementCompute = typename B2bImplicitGemmKernel::ElementCompute; + using OperatorClass = typename B2bImplicitGemmKernel::OperatorClass; + using ArchTag = typename B2bImplicitGemmKernel::ArchTag; + using ThreadblockShape0 = typename B2bImplicitGemmKernel::ThreadblockShape0; + using ThreadblockShape1 = typename B2bImplicitGemmKernel::ThreadblockShape1; + using WarpShape0 = typename B2bImplicitGemmKernel::WarpShape0; + using WarpShape1 = typename B2bImplicitGemmKernel::WarpShape1; + using InstructionShape = typename B2bImplicitGemmKernel::InstructionShape; + using ThreadblockSwizzle = typename B2bImplicitGemmKernel::ThreadblockSwizzle; + using EpilogueOutputOp0 = typename B2bImplicitGemmKernel::EpilogueOutputOp0; + using EpilogueOutputOp1 = typename B2bImplicitGemmKernel::EpilogueOutputOp1; + static int const kStages = B2bImplicitGemmKernel::kStages; + static int const kConvDim = B2bImplicitGemmKernel::kConvDim; + using WarpMmaOperator0 = typename B2bImplicitGemmKernel::WarpMmaOperator0; + using WarpMmaOperator1 = typename B2bImplicitGemmKernel::WarpMmaOperator1; + using ArchMmaOperator = typename B2bImplicitGemmKernel::ArchMmaOperator; + using MathOperator = typename B2bImplicitGemmKernel::MathOperator; + + static cutlass::conv::Operator const kConvolutionalOperator = B2bImplicitGemmKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = B2bImplicitGemmKernel::kIteratorAlgorithm; + + static int const kWarpCount = + (ThreadblockShape0::kM / WarpShape0::kM) * + (ThreadblockShape0::kN / WarpShape0::kN); + + /// Argument structure + using Arguments = typename B2bImplicitGemmKernel::Arguments; + +private: + + /// Kernel parameters object + typename B2bImplicitGemmKernel::Params 
params_; + +public: + + /// Constructs Implicit GEMM + B2bImplicitGemmConvolution() { } + + /// Determines whether the Implicit GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + // dispatch to iterators + Status status = B2bImplicitGemmKernel::B2bMma::IteratorA0::can_implement(args.problem_size_0); + if (Status::kSuccess != status) { + return status; + } + + status = B2bImplicitGemmKernel::B2bMma::IteratorB0::can_implement(args.problem_size_0); + if (Status::kSuccess != status) { + return status; + } + + status = B2bImplicitGemmKernel::B2bMma::IteratorB1::can_implement(args.problem_size_1); + if (Status::kSuccess != status) { + return status; + } + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape( + threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0), + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.problem_size_0.split_k_slices)); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t workspace_bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0), + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.problem_size_0.split_k_slices); + + if(args.split_k_mode == SplitKMode::kParallel) { + + // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace. + // The user needs to call a reduction operator to optain the final output tensor + workspace_bytes = + sizeof(ElementAccumulator) * + size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size_0)) * + size_t(grid_tiled_shape.k()); + } + + else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size_0.split_k_slices > 1) { + + // Split-K serial: The user workspace is used to store semaphore and serialize writing the + // final reduced output to user's output tensor + workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n()); + } + + return workspace_bytes; + } + + /// Initializes GEMM state from arguments. 
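  // For split-K runs (problem_size_0.split_k_slices > 1) a non-null workspace sized by
  // get_workspace_size() is required and is zero-filled before the Params structure is
  // built. As a concrete illustration (hypothetical sizes): with kSerial split-K and a
  // tiled shape of 8 x 4 threadblocks the workspace holds 8 * 4 semaphores, i.e.
  // 32 * sizeof(int) bytes; with kParallel split-K over 4 slices it instead holds 4
  // accumulator-typed copies of the first convolution's output tensor. When the kernel's
  // SharedStorage exceeds the 48 KB static limit, the dynamic shared-memory limit is
  // raised via cudaFuncSetAttribute before launch.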
+ Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + if (args.problem_size_0.split_k_slices > 1) { + + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream); + + if (status != cudaSuccess) { + return Status::kErrorInternal; + } + } + + // initialize the params structure from the arguments + params_ = typename B2bImplicitGemmKernel::Params( + args, + static_cast(workspace) + ); + + int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + cutlass::Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Initializes GEMM state from arguments. + Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.ptr_A0 = args.ref_A0.data(); + params_.ptr_B0 = args.ref_B0.data(); + params_.ptr_C0 = args.ref_C0.data(); + params_.ptr_B1 = args.ref_B1.data(); + params_.ptr_C1 = args.ref_C1.data(); + params_.ptr_D1 = args.ref_D1.data(); + params_.output_op_0 = args.output_op_0; + params_.output_op_1 = args.output_op_1; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(32 * kWarpCount, 1, 1); + + int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage)); + + cutlass::Kernel<<>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace device +} // namespace conv +} // namespace cutlass +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/fused_conv2d.cu b/examples/13_two_tensor_op_fusion/fused_conv2d.cu new file mode 100644 index 0000000000..f6bb3d7259 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/fused_conv2d.cu @@ -0,0 +1,102 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h" +#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h" +#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h" +#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h" + +int run() { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + std::cout << "Running on SM80" << std::endl; + run_nonfused_conv2d_fprop_optimized_f16_sm80(); + run_fused_conv2d_fprop_optimized_f16_sm80(); + run_nonfused_conv2d_fprop_optimized_s8_sm80(); + run_fused_conv2d_fprop_optimized_s8_sm80(); +#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + std::cout << "Running on SM75" << std::endl; + run_nonfused_conv2d_fprop_optimized_f16_sm75(); + run_fused_conv2d_fprop_optimized_f16_sm75(); + run_nonfused_conv2d_fprop_optimized_s8_sm75(); + run_fused_conv2d_fprop_optimized_s8_sm75(); +#endif + + return 0; +} + +int main() { + + bool notSupported = false; + + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." 
<< std::endl; + + notSupported = true; + + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75." + << std::endl; + + notSupported = true; + } + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + return run(); +} + diff --git a/examples/13_fused_two_gemms/fused_gemm.cu b/examples/13_two_tensor_op_fusion/fused_gemm.cu similarity index 66% rename from examples/13_fused_two_gemms/fused_gemm.cu rename to examples/13_two_tensor_op_fusion/fused_gemm.cu index b96a0ef090..65bad94338 100644 --- a/examples/13_fused_two_gemms/fused_gemm.cu +++ b/examples/13_two_tensor_op_fusion/fused_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -22,43 +22,22 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ -/* - -This example shows fusing two GEMM mainloops into one kernel. The first GEMM computes relu(alpha*A*B) and -the second GEMM computes relu(alpha*A*B+beta*C). The performance measuring environment compares against -two unfused GEMM operations, demonstrating a speedup of the fused kernel on the -NVIDIA Turing GPU architecture. - -Problem size: - GEMM1 (M,N,K): 128*1600, 64, 576 - GEMM2 (M,N,K): 128*1600, 128, 64 - -Note that GEMM1_N = GEMM2_K - -The example requires the number of threadblocks be the same across 2 GEMMs and -thread_block_tile_N = problem_N so the data required by each layer is threadblock-resident. It -also requires warp_tile_N = thread_block_tile_N so the data required by each warp is -register-file-resident. - -Performance: - - fp16 on Tesla T4 @ 1590MHz (non-fused vs. fused): 1.39011 ms vs. 1.26035 ms - - int8 on Tesla T4 @ 1590MHz (non-fused vs. fused): 0.751759 ms vs. 0.62971 ms - - fp16 on Quadro RTX 8000 @ 1890MHz (non-fused vs. fused): 0.721144 ms vs. 0.629864 ms - - int8 on Quadro RTX 8000 @ 1890MHz (non-fused vs. fused): 0.379049 ms vs. 0.324764 ms - - int8 on GA100 @ 1200MHz (non-fused vs. fused): 0.153795 ms vs. 0.129874 ms - -*/ #include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h" +#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h" #include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h" #include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h" int run() { #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + std::cout << "Running on SM80" << std::endl; + run_nonfused_gemm_f16_sm80(); + run_fused_gemm_f16_sm80(); run_nonfused_gemm_s8_sm80(); run_fused_gemm_s8_sm80(); #elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + std::cout << "Running on SM75" << std::endl; run_nonfused_gemm_f16(); run_fused_gemm_f16(); run_nonfused_gemm_s8(); @@ -74,9 +53,9 @@ int main() { // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. 
// - // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { - std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl; notSupported = true; } @@ -90,7 +69,7 @@ int main() { } if (!(props.major * 10 + props.minor >= 75)) { - std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75." << std::endl; notSupported = true; diff --git a/examples/13_fused_two_gemms/kernel/b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h similarity index 99% rename from examples/13_fused_two_gemms/kernel/b2b_gemm.h rename to examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h index a67b1e877c..5627fc319b 100644 --- a/examples/13_fused_two_gemms/kernel/b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h new file mode 100644 index 0000000000..9a7b462a38 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h @@ -0,0 +1,475 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined Implicit GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename B2bMma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize ///! 
Convolutional operator on 2D or 3D problem +> +struct B2bImplicitGemmConvolution { + + using B2bMma = B2bMma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp0 = typename B2bMma::OutputOp; + using EpilogueOutputOp1 = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename B2bMma::IteratorA0::Element; + using LayoutA = typename B2bMma::IteratorA0::Layout; + using ElementB = typename B2bMma::IteratorB0::Element; + using LayoutB = typename B2bMma::IteratorB0::Layout; + using ElementC = typename EpilogueOutputOp1::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp0::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp0::ElementCompute; + + using WarpMmaOperator0 = typename B2bMma::Policy0::Operator; + using WarpMmaOperator1 = typename B2bMma::Policy1::Operator; + + using ArchMmaOperator = typename WarpMmaOperator0::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator0::OperatorClass; + using ArchTag = typename WarpMmaOperator0::ArchTag; + + using ThreadblockShape0 = typename B2bMma::Shape0; + using ThreadblockShape1 = typename B2bMma::Shape1; + using WarpShape0 = typename WarpMmaOperator0::Shape; + using WarpShape1 = typename WarpMmaOperator1::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + static int const kStages = B2bMma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = B2bMma::IteratorA0::kIteratorAlgorithm; + + /// Warp count (concept: GemmShape) + using WarpCount0 = typename B2bMma::WarpCount0; + static int const kThreadCount = 32 * WarpCount0::kCount; + + using TensorRefA0 = typename B2bMma::IteratorA0::TensorRef; + using TensorRefB0 = typename B2bMma::IteratorB0::TensorRef; + using TensorRefB1 = typename B2bMma::IteratorB1::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::B2bImplicitGemmConvolution::kConvDim + static_assert(B2bMma::IteratorA0::kConvDim == B2bMma::IteratorB0::kConvDim, + "Convolution on different different dimensions is not supported"); + static int const kConvDim = B2bMma::IteratorA0::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + /// Wgrad C stride idx for implicit gemm algorithm + // Conv2d row-major matrix C (KxRSC) + // Conv3d row-major matrix C (KxTRSC) + static int const kWgradCStrideIdx = + cutlass::platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorCStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? 
kWgradCStrideIdx : 0); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size_0; + ConvProblemSize problem_size_1; + TensorRefA0 ref_A0; + TensorRefB0 ref_B0; + TensorRefC ref_C0; + TensorRefB1 ref_B1; + TensorRefC ref_C1; + TensorRefC ref_D1; + typename EpilogueOutputOp0::Params output_op_0; + typename EpilogueOutputOp1::Params output_op_1; + SplitKMode split_k_mode; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size_0, + ConvProblemSize const & problem_size_1 + ): + problem_size_0(problem_size_0), + problem_size_1(problem_size_1) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size_0, + ConvProblemSize const & problem_size_1, + TensorRefA0 const & ref_A0, + TensorRefB0 const & ref_B0, + TensorRefC const & ref_C0, + TensorRefB1 const & ref_B1, + TensorRefC const & ref_C1, + TensorRefC const & ref_D1, + typename EpilogueOutputOp0::Params const & output_op_0, + typename EpilogueOutputOp1::Params const & output_op_1, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size_0(problem_size_0), + problem_size_1(problem_size_1), + ref_A0(ref_A0), + ref_B0(ref_B0), + ref_C0(ref_C0), + ref_B1(ref_B1), + ref_C1(ref_C1), + ref_D1(ref_D1), + output_op_0(output_op_0), + output_op_1(output_op_1), + split_k_mode(split_k_mode) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size_0; + ConvProblemSize problem_size_1; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size_0; + gemm::GemmCoord implicit_gemm_problem_size_1; + int gemm_k_iterations_0; + int gemm_k_iterations_1; + typename B2bMma::IteratorA0::Params iterator_A0; + typename B2bMma::IteratorA0::Element const *ptr_A0; + typename B2bMma::IteratorB0::Params iterator_B0; + typename B2bMma::IteratorB0::Element const *ptr_B0; + typename Epilogue::OutputTileIterator::Params iterator_C0; + typename Epilogue::OutputTileIterator::Element *ptr_C0; + typename B2bMma::IteratorB1::Params iterator_B1; + typename B2bMma::IteratorB1::Element const *ptr_B1; + typename Epilogue::OutputTileIterator::Params iterator_C1; + typename Epilogue::OutputTileIterator::Element *ptr_C1; + typename Epilogue::OutputTileIterator::Params iterator_D1; + typename Epilogue::OutputTileIterator::Element *ptr_D1; + typename EpilogueOutputOp0::Params output_op_0; + typename EpilogueOutputOp1::Params output_op_1; + int *semaphore; + SplitKMode split_k_mode; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): gemm_k_iterations_0(0), gemm_k_iterations_1(0) { } + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size_0(args.problem_size_0), + problem_size_1(args.problem_size_1), + implicit_gemm_problem_size_0(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0)), + implicit_gemm_problem_size_1(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1)), + grid_tiled_shape(grid_tiled_shape), + iterator_A0(B2bMma::IteratorA0::getParams(args.problem_size_0, args.ref_A0.layout())), + ptr_A0(args.ref_A0.data()), + iterator_B0(args.problem_size_0, args.ref_B0.layout()), + 
ptr_B0(args.ref_B0.data()), + iterator_C0(ConvOutputIteratorParameter::layout(args.ref_C0)), + ptr_C0(args.ref_C0.data()), + iterator_B1(args.problem_size_1, args.ref_B1.layout()), + ptr_B1(args.ref_B1.data()), + iterator_C1(ConvOutputIteratorParameter::layout(args.ref_C1)), + ptr_C1(args.ref_C1.data()), + iterator_D1(ConvOutputIteratorParameter::layout(args.ref_D1)), + ptr_D1(args.ref_D1.data()), + output_op_0(args.output_op_0), + output_op_1(args.output_op_1), + semaphore(semaphore), + split_k_mode(args.split_k_mode) + { + gemm_k_iterations_0 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape0::kK, args.problem_size_0); + gemm_k_iterations_1 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape1::kK, args.problem_size_1); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + implicit_gemm_problem_size_0, + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.problem_size_0.split_k_slices); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename B2bMma::B2bMmaSharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + B2bImplicitGemmConvolution() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename B2bMma::IteratorA0 iterator_A0( + params.iterator_A0, + params.problem_size_0, + params.ptr_A0, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() * B2bMma::Shape0::kM, + threadblock_tile_idx.k() * B2bMma::Shape0::kK + ) + ); + + typename B2bMma::IteratorB0 iterator_B0( + params.iterator_B0, + params.problem_size_0, + params.ptr_B0, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * B2bMma::Shape0::kK, + threadblock_tile_idx.n() * B2bMma::Shape0::kN + ) + ); + + typename B2bMma::IteratorB1 iterator_B1( + params.iterator_B1, + params.problem_size_1, + params.ptr_B1, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * B2bMma::Shape1::kK, + threadblock_tile_idx.n() * B2bMma::Shape1::kN + ) + ); + + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + EpilogueOutputOp0 output_op_0(params.output_op_0); + + // Construct thread-scoped matrix multiply + B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename B2bMma::FragmentC0 src_accum; + typename B2bMma::FragmentC1 accumulators; + + src_accum.clear(); + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_B1, src_accum, output_op_0); + + // + // Epilogue + // + + EpilogueOutputOp1 output_op_1(params.output_op_1); + + // Construct the semaphore. 
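+    // For serial split-K reductions, threadblocks that share an output tile use this
+    // semaphore to serialize their epilogues: the lock is fetched early (so its latency
+    // can overlap iterator construction), each k-partition waits for its own index,
+    // reads the partial results written by the previous partition from the 'D' tensor,
+    // and then releases the lock with the next index (or resets it to zero if it is the
+    // final partition). In parallel split-K mode, each slice instead writes to a distinct
+    // region of the output tensor.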
+ int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op_1.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * B2bMma::Shape1::kM, + threadblock_tile_idx.n() * B2bMma::Shape1::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D1( + params.iterator_D1, + params.ptr_D1, + ConvOutputIteratorParameter::extent(params.problem_size_1), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C1( + params.iterator_C1, + params.ptr_C1, + ConvOutputIteratorParameter::extent(params.problem_size_1), + thread_idx, + threadblock_offset + ); + + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_idx.k()) { + iterator_C1 = iterator_D1; + } + + semaphore.wait(threadblock_tile_idx.k()); + + __threadfence(); + } + // Each split-k-slice writes to a unique tensor location + else if (params.split_k_mode == SplitKMode::kParallel) { + iterator_D1.add_pointer_offset(threadblock_tile_idx.k() * + cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size_1)); + } + + // Run efficient epilogue + epilogue(output_op_1, iterator_D1, accumulators, iterator_C1); + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h new file mode 100644 index 0000000000..a9813e6d2a --- /dev/null +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h @@ -0,0 +1,1281 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "kernel/b2b_implicit_gemm_convolution.h" +#include "threadblock/b2b_implicit_gemm_pipelined.h" +#include "threadblock/b2b_implicit_gemm_multistage.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultB2bConv2dFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions 
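+//
+// Each partial specialization below composes two DefaultMmaCore definitions (one per
+// fused convolution), the matching activation/filter tile access iterators, a
+// threadblock-scoped B2b MMA (pipelined or multistage), and an epilogue, and exposes
+// the composed result as ::Kernel.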
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage +/// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultB2bConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA0 + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB0 + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB1 + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmMultistage< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + arch::CacheOperation::Always, + IteratorB0, + SmemIteratorB0, + arch::CacheOperation::Global, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + 
arch::CacheOperation::Global, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage +/// pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultB2bConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA0 = typename MmaCore0::SmemThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA0 + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
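+  // (The non-interleaved specializations above use MmaCore0::IteratorThreadMapB here;
+  // for the interleaved layouts the shared-memory thread map is reused instead, as
+  // explained in the note above.)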
+ using ThreadMapB0 = typename MmaCore0::SmemThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB0 + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + using ThreadMapB1 = typename MmaCore1::SmemThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB1 + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmMultistage< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + arch::CacheOperation::Always, + IteratorB0, + SmemIteratorB0, + arch::CacheOperation::Global, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + arch::CacheOperation::Global, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm +/// and 2 stage pipeline. 
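+///
+/// Unlike the multistage specializations above, which are built on
+/// threadblock::B2bImplicitGemmMultistage and are parameterized by a Stages count and
+/// per-operand cache-operation policies, the two-stage specializations below use
+/// threadblock::B2bImplicitGemmPipelined and wrap each tile access iterator in
+/// conv::threadblock::TileIterator.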
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultB2bConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA0 + > + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB0 + > + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB1 + > + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmPipelined< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + IteratorB0, + SmemIteratorB0, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + ElementC, + LayoutC, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1 + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + 
ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultB2bConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA0 = typename MmaCore0::SmemThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA0 + > + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
+ using ThreadMapB0 = typename MmaCore0::SmemThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB0 + > + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::SmemThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB1 + > + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmPipelined< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + IteratorB0, + SmemIteratorB0, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + ElementC, + LayoutC, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1 + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and +/// multistage pipeline. 
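+///
+/// These specializations mirror the analytic ones above; the difference is that they
+/// instantiate the Conv2dFpropActivationTileAccessIteratorOptimized and
+/// Conv2dFpropFilterTileAccessIteratorOptimized tile access iterators
+/// (IteratorAlgorithm::kOptimized) in place of the analytic iterators.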
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultB2bConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA0 + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB0 + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB1 + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmMultistage< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + arch::CacheOperation::Always, + IteratorB0, + SmemIteratorB0, + arch::CacheOperation::Global, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + arch::CacheOperation::Global, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape1, + 
WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and +// multistage pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultB2bConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA0 = typename MmaCore0::SmemThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA0 + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
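+  // (Note that the interleaved specializations in this file use layout::RowMajor for the
+  // AccumulatorLayout of FragmentIteratorA1 defined below, whereas the non-interleaved
+  // specializations use layout::ColumnMajor.)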
+ using ThreadMapB0 = typename MmaCore0::SmemThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB0 + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + using ThreadMapB1 = typename MmaCore1::SmemThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB1 + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmMultistage< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + arch::CacheOperation::Always, + IteratorB0, + SmemIteratorB0, + arch::CacheOperation::Global, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + arch::CacheOperation::Global, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm +/// and 2 stage pipeline. 
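+///
+/// Illustrative sketch only (not taken from this example): one way a specialization of
+/// this form might be instantiated. The epilogue operator, tile shapes, and swizzle shown
+/// here are assumptions chosen for illustration, not values used by this file.
+///
+///   using ElementOutput = cutlass::half_t;
+///
+///   // Hypothetical epilogue: linear combination followed by ReLU, 8-element vectors
+///   using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
+///       ElementOutput, 8, cutlass::half_t, cutlass::half_t>;
+///
+///   using B2bFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+///       cutlass::half_t, cutlass::layout::TensorNHWC,    // A: activations
+///       cutlass::half_t, cutlass::layout::TensorNHWC,    // B: filters
+///       ElementOutput,   cutlass::layout::TensorNHWC,    // C/D: output
+///       cutlass::half_t,                                  // accumulator
+///       cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
+///       cutlass::gemm::GemmShape<64, 64, 32>,             // threadblock tile, conv 0
+///       cutlass::gemm::GemmShape<64, 128, 32>,            // threadblock tile, conv 1
+///       cutlass::gemm::GemmShape<32, 64, 32>,             // warp tile, conv 0
+///       cutlass::gemm::GemmShape<32, 128, 32>,            // warp tile, conv 1
+///       cutlass::gemm::GemmShape<16, 8, 8>,               // Tensor Core instruction
+///       EpilogueOp, EpilogueOp,
+///       cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+///       2,                                                // two-stage pipeline
+///       cutlass::arch::OpMultiplyAdd,
+///       cutlass::conv::IteratorAlgorithm::kOptimized
+///   >::Kernel;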
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultB2bConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA0 + > + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB0 + > + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB1 + > + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmPipelined< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + IteratorB0, + SmemIteratorB0, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + ElementC, + LayoutC, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1 + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + 
ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultB2bConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
+ + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::SmemThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA0 + > + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::SmemThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB0 + > + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + using ThreadMapB1 = typename MmaCore1::SmemThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB1 + > + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmPipelined< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + IteratorB0, + SmemIteratorB0, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + ElementC, + LayoutC, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1 + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h similarity index 76% rename from examples/13_fused_two_gemms/kernel/default_b2b_gemm.h rename to examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h index dab9db904c..cdf537566b 100644 --- a/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h @@ -1,29 +1,28 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - *modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - *notice, this list of conditions and the following disclaimer in the - *documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its - *contributors may be used to endorse or promote products derived from this - *software without specific prior written permission. + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, - *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING - *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ + /*! 
\file \brief Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with @@ -118,6 +117,75 @@ template < > struct DefaultB2bGemm; +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp0, + /// Epilogue output operator + typename EpilogueOutputOp1, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator> +struct DefaultB2bGemm { + /// Define the threadblock-scoped matrix multiply-accumulate + using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + InstructionShape, Stages, Operator, EpilogueOutputOp0>::ThreadblockB2bMma; + + static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1, + EpilogueOutputOp1::kCount>::Epilogue; + + /// Define the kernel-level GEMM operator. + using B2bGemmKernel = kernel::B2bGemm; +}; + + //////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Turing Architecture diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h new file mode 100644 index 0000000000..8462cfe6f0 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h @@ -0,0 +1,757 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "threadblock/b2b_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
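// Informal overview of the fused structure implemented below: the first
// implicit GEMM accumulates into registers (FragmentC0); FragmentIteratorA1_
// then re-reads those registers as the A operand of the second GEMM, and only
// the B1 operand of the second GEMM is staged through shared memory with
// cp.async. The intermediate activation tile therefore never makes a round
// trip through global memory between the two GEMMs.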
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape0_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA0_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA0_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA0, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB0_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB0_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB0, + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile + // (concept::MmaTensorOpFragmentIterator) + typename FragmentIteratorA1_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB1, + /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) + typename OutputOp_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy0_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class B2bImplicitGemmMultistage : + public gemm::threadblock::B2bMmaBase { +public: + ///< Base class + using Base = gemm::threadblock::B2bMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape0 = Shape0_; + ///< Iterates over tiles of A operand in global memory + using IteratorA0 = IteratorA0_; + ///< Iterates over tiles of B operand in global memory + using IteratorB0 = IteratorB0_; + ///< Policy describing tuning details + using Policy0 = Policy0_; + + using SmemIteratorA0 = SmemIteratorA0_; + using SmemIteratorB0 = SmemIteratorB0_; + + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape1 = Shape1_; + ///< Iterates over tiles of A operand in global memory + using FragmentIteratorA1 = FragmentIteratorA1_; + ///< Iterates over tiles of B operand in global memory + using IteratorB1 = IteratorB1_; + ///< Policy describing tuning details + using Policy1 = Policy1_; + + using SmemIteratorB1 = SmemIteratorB1_; + + ///< Epilogue after 1st Gemm + using OutputOp = OutputOp_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0; + static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0; + static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1; + + // + // Dependent types + // + + using ElementC = typename Policy0::Operator::ElementC; + + /// Fragment of accumulator tile + using FragmentC0 = typename Policy0::Operator::FragmentC; + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + + /// Warp-level Mma + using Operator1 = typename 
Policy1::Operator; + + /// Internal structure exposed for introspection. + struct Detail { + + static_assert(Base::kWarpGemmIterations0 > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + static_assert(Base::kWarpGemmIterations1 > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA0 = + IteratorA0::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB0 = + IteratorB0::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB1 = + IteratorB1::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA0 = + (AsyncCopyIterationsPerStageA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB0 = + (AsyncCopyIterationsPerStageB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB1 = + (AsyncCopyIterationsPerStageB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1; + }; + + private: + + using WarpLoadedFragmentA0 = typename Operator0::FragmentA; + using WarpLoadedFragmentB0 = typename Operator0::FragmentB; + /// Warp Fragment of operand A1 loaded from accmulator tile + using WarpLoadedFragmentA1 = typename FragmentIteratorA1::Fragment; + using WarpLoadedFragmentB1 = typename Operator1::FragmentB; + using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA; + using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB; + using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA; + using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA0 smem_iterator_A0_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB0 smem_iterator_B0_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + B2bImplicitGemmMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::B2bMmaSharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx), + smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx), + smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the 
threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN); + int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A0_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations0 * warp_idx_k}); + this->warp_tile_iterator_B0_.add_tile_offset( + {Base::kWarpGemmIterations0 * warp_idx_k, warp_idx_n}); + this->warp_tile_iterator_B1_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance_0( + IteratorA0 &iterator_A0, IteratorB0 &iterator_B0, + int group_start_A0 = 0, int group_start_B0 = 0) { + + iterator_A0.set_iteration_index(group_start_A0); + this->smem_iterator_A0_.set_iteration_index(group_start_A0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) { + + if (group_start_A0 + j < Detail::AsyncCopyIterationsPerStageA0) { + typename IteratorA0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A0_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA0::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A0.get(), iterator_A0.valid()); + + ++iterator_A0; + + ++this->smem_iterator_A0_; + } + } + + iterator_B0.set_iteration_index(group_start_B0); + + this->smem_iterator_B0_.set_iteration_index(group_start_B0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) { + if (group_start_B0 + j < Detail::AsyncCopyIterationsPerStageB0) { + typename IteratorB0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B0_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB0::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B0.get(), iterator_B0.valid()); + + ++iterator_B0; + ++this->smem_iterator_B0_; + } + } + } + + CUTLASS_DEVICE + void copy_tiles_and_advance_1( + IteratorB1 &iterator_B1, + int group_start_B1 = 0) { + + iterator_B1.set_iteration_index(group_start_B1); + + this->smem_iterator_B1_.set_iteration_index(group_start_B1); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) { + if (group_start_B1 + j < Detail::AsyncCopyIterationsPerStageB1) { + typename IteratorB1::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + ++this->smem_iterator_B1_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations_0, + ///< destination accumulator tile + FragmentC1 &accum, + ///< iterator over A operand in global memory + IteratorA0 iterator_A0, + ///< iterator over B operand in global memory + IteratorB0 iterator_B0, + ///< iterator over B operand in global memory + IteratorB1 iterator_B1, + ///< initial value of accumulator + FragmentC0 const &src_accum, + ///< epilogue operation after 1st Gemm + OutputOp output_op_0, + ///< Imaginary strides used for planar-complex only - ignored here + int64_t imag_stride_A = 0, + int64_t 
imag_stride_B = 0) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations_0) { + + iterator_A0.set_iteration_index(0); + this->smem_iterator_A0_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA0; ++j) { + typename IteratorA0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A0_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorA0::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A0.get(), iterator_A0.valid()); + + ++iterator_A0; + ++this->smem_iterator_A0_; + } + + iterator_B0.set_iteration_index(0); + this->smem_iterator_B0_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB0; ++j) { + typename IteratorB0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B0_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB0::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B0.get(), iterator_B0.valid()); + + ++iterator_B0; + ++this->smem_iterator_B0_; + } + + // Move to the next stage + iterator_A0.advance(); + iterator_B0.advance(); + + this->smem_iterator_A0_.add_tile_offset({0, 1}); + this->smem_iterator_B0_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + FragmentC0 accum0 = src_accum; + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA0 warp_loaded_frag_A0[2]; + WarpLoadedFragmentB0 warp_loaded_frag_B0[2]; + WarpTransformedFragmentA0 warp_transformed_frag_A0[2]; + WarpTransformedFragmentB0 warp_transformed_frag_B0[2]; + + Operator0 warp_mma0; + + this->warp_tile_iterator_A0_.set_kgroup_index(0); + this->warp_tile_iterator_B0_.set_kgroup_index(0); + + this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]); + this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance_0(iterator_A0, iterator_B0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0], + warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]); + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations_0 > (-Base::kStages + 1);) { + + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
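        // The fragment indexing below implements a two-entry register double
        // buffer: while the MMA consumes fragment [warp_mma_k % 2], the next
        // k-group is prefetched into fragment [(warp_mma_k + 1) % 2].
        // set_kgroup_index() wraps modulo kWarpGemmIterations0, so on the last
        // k-group of a stage the prefetch targets k-group 0 of the following
        // shared-memory stage. For example, with kWarpGemmIterations0 == 4 each
        // stage loads k-groups 1, 2, 3, 0 while the multiply-accumulates
        // consume k-groups 0, 1, 2, 3.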
+ + this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + + this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + if (warp_mma_k > 0) + warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2], + warp_transformed_frag_B0[warp_mma_k % 2], + warp_loaded_frag_A0[warp_mma_k % 2], + warp_loaded_frag_B0[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_A0, group_start_iteration_B0; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations0) { + group_start_iteration_A0 = 0; + group_start_iteration_B0 = 0; + } else { + group_start_iteration_A0 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA0; + group_start_iteration_B0 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB0; + } + + copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0, + group_start_iteration_B0); + + warp_mma0( + accum0, + warp_transformed_frag_A0[warp_mma_k % 2], + warp_transformed_frag_B0[warp_mma_k % 2], + accum0 + ); + + if (warp_mma_k + 1 == Base::kWarpGemmIterations0) + warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2], + warp_transformed_frag_B0[(warp_mma_k + 1) % 2], + warp_loaded_frag_A0[(warp_mma_k + 1) % 2], + warp_loaded_frag_B0[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations0) { + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A0.advance(); + iterator_B0.advance(); + + this->smem_iterator_A0_.add_tile_offset({0, 1}); + this->smem_iterator_B0_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A0_.add_tile_offset( + {0, -Base::kStages * Policy0::kPartitionsK * + Base::kWarpGemmIterations0}); + this->warp_tile_iterator_B0_.add_tile_offset( + {-Base::kStages * Policy0::kPartitionsK * + Base::kWarpGemmIterations0, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations_0; + } + } + + } + + // Insert fence and wait for all outstanding cp.async operations to commit. 
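    // cp_async_fence() commits the cp.async instructions issued so far as one
    // group, and cp_async_wait<0>() blocks until every committed group has
    // completed (the template argument is the number of groups allowed to
    // remain in flight). Draining to zero here, followed by __syncthreads(),
    // makes the hand-off unambiguous: no copy issued for A0/B0 is still
    // outstanding when the B1 prologue of the second GEMM starts committing
    // its own stages.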
+ cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + + // 2nd Implicit Gemm + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile + FragmentIteratorA1 warp_tile_iterator_A1_(accum0); + + // + // Prologue + // + int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations_1) { + + iterator_B1.set_iteration_index(0); + this->smem_iterator_B1_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB1; ++j) { + typename IteratorB1::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + ++this->smem_iterator_B1_; + } + + // Move to the next stage + iterator_B1.advance(); + + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA1 warp_loaded_frag_A1[2]; + WarpLoadedFragmentB1 warp_loaded_frag_B1[2]; + WarpTransformedFragmentA1 warp_transformed_frag_A1[2]; + WarpTransformedFragmentB1 warp_transformed_frag_B1[2]; + + Operator1 warp_mma1; + + this->warp_tile_iterator_B1_.set_kgroup_index(0); + + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], output_op_0); + this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance_1(iterator_B1); + + smem_write_stage_idx = Base::kStages - 1; + smem_read_stage_idx = 0; + + warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0], + warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]); + + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1 - (Base::kStages - 1); + gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
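        // This loop mirrors the first mainloop, but the A1 fragments are
        // produced by warp_tile_iterator_A1_, which walks the register-held
        // accumulator of the first GEMM; passing output_op_0 to load() applies
        // the first GEMM's scalar epilogue (a LinearCombination-style functor
        // such as the clamped variant named in the template comments) during
        // the load. Only the B1 fragments follow the global-memory ->
        // shared-memory path in this loop.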
+ + this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); + + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], output_op_0); + this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + if (warp_mma_k > 0) + warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + warp_loaded_frag_A1[warp_mma_k % 2], + warp_loaded_frag_B1[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_B1; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations1) { + group_start_iteration_B1 = 0; + } else { + group_start_iteration_B1 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB1; + } + + copy_tiles_and_advance_1(iterator_B1, + group_start_iteration_B1); + + warp_mma1( + accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + accum + ); + + if (warp_mma_k + 1 == Base::kWarpGemmIterations1) + warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2], + warp_transformed_frag_B1[(warp_mma_k + 1) % 2], + warp_loaded_frag_A1[(warp_mma_k + 1) % 2], + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations1) { + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_B1.advance(); + + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_B1_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations1, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + } + } + + } + + // Insert fence and wait for all outstanding cp.async operations to commit. + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h new file mode 100644 index 0000000000..b1e929ed23 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h @@ -0,0 +1,483 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "threadblock/b2b_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
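// Two-stage (double-buffered) counterpart of B2bImplicitGemmMultistage above:
// tiles are fetched from global memory into register fragments with ordinary
// loads and written to shared memory by the thread block, ping-ponging between
// two shared-memory buffers, instead of being streamed with cp.async. The
// static_assert further down pins Base::kStages to 2 for this variant, which
// is the one typically selected when cp.async is not available (pre-SM80
// tensor-op targets).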
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape0_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA0_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA0_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB0_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB0_, + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile + // (concept::MmaTensorOpFragmentIterator) + typename FragmentIteratorA1_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) + typename OutputOp_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy0_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Transformation applied to A operand + typename TransformA0_ = NumericArrayConverter< + typename SmemIteratorA0_::Element, + typename IteratorA0_::Element, + IteratorA0_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB0_ = NumericArrayConverter< + typename SmemIteratorB0_::Element, + typename IteratorB0_::Element, + IteratorB0_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB1_ = NumericArrayConverter< + typename SmemIteratorB1_::Element, + typename IteratorB1_::Element, + IteratorB1_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool +> +class B2bImplicitGemmPipelined : public gemm::threadblock::B2bMmaBase { +public: + + ///< Base class + using Base = gemm::threadblock::B2bMmaBase; + + using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory + using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory + using Policy0 = Policy0_; ///< Policy0 describing tuning details + + using SmemIteratorA0 = SmemIteratorA0_; + using SmemIteratorB0 = SmemIteratorB0_; + + using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over tiles of A operand in global memory + using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory + using Policy1 = Policy1_; ///< Policy1 describing tuning details + + using SmemIteratorB1 = SmemIteratorB1_; + + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + + using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm + + using TransformA0 = TransformA0_; + using TransformB0 = TransformB0_; + using TransformB1 = 
TransformB1_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA0 = typename IteratorA0::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB0 = typename IteratorB0::Fragment; + + /// Fragment of accumulator tile + using FragmentC0 = typename Policy0::Operator::FragmentC; + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + + /// Fragment of operand B loaded from global memory + using FragmentB1 = typename IteratorB1::Fragment; + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + + /// Warp-level Mma + using Operator1 = typename Policy1::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy0::Operator::ArchTag; + + /// Complex transform on A0 operand + static ComplexTransform const kTransformA0 = Operator0::kTransformA; + + /// Complex transform on B0 operand + static ComplexTransform const kTransformB0 = Operator0::kTransformB; + + /// Complex transform on B1 operand + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA0 = typename Operator0::FragmentA; + using WarpFragmentB0 = typename Operator0::FragmentB; + /// Warp Fragment of operand A1 loaded from accmulator tile + using WarpFragmentA1 = typename FragmentIteratorA1::Fragment; + using WarpFragmentB1 = typename Operator1::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA0 smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B0 operand to shared memory + SmemIteratorB0 smem_iterator_B0_; + + /// Iterator to write threadblock-scoped tile of B1 operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + B2bImplicitGemmPipelined( + typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx), + smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx), + smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) { + + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN); + int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM; + + //These may change across different GEMM layers + int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k; + int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, 
tile_offset_k_0}); + this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n}); + this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n}); + + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations_0, ///< number of iterations of the mainloop + FragmentC1 &accum, ///< destination accumulator tile + IteratorA0 iterator_A, ///< iterator over A operand in global memory + IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory + IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory + FragmentC0 const &src_accum, ///< source accumulator tile + OutputOp output_op_0, ///< epilogue operation after 1st Gemm + TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment + TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment + TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + FragmentC0 accum0 = src_accum; + + FragmentA0 tb_frag_A; + FragmentB0 tb_frag_B0; + + tb_frag_A.clear(); + tb_frag_B0.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B0.load(tb_frag_B0); + + ++iterator_A; + ++iterator_B0; + + this->smem_iterator_A_.store(transform_A0(tb_frag_A)); + this->smem_iterator_B0_.store(transform_B0(tb_frag_B0)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B0_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA0 warp_frag_A0[2]; + WarpFragmentB0 warp_frag_B0[2]; + + this->warp_tile_iterator_A0_.set_kgroup_index(0); + this->warp_tile_iterator_B0_.set_kgroup_index(0); + + this->warp_tile_iterator_A0_.load(warp_frag_A0[0]); + this->warp_tile_iterator_B0_.load(warp_frag_B0[0]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + Operator0 warp_mma0; + + int smem_write_stage_idx = 1; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. 
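        // Double-buffering for the 2-stage pipeline: the threadblock fragments
        // prefetched from global memory at warp_mma_k == 0 are written to the
        // other shared-memory buffer on the final warp-level k iteration, and
        // smem_write_stage_idx is toggled with XOR. The negative
        // add_tile_offset() calls rewind whichever iterators have just walked
        // past the end of the two-stage circular buffer (the shared-memory
        // write iterators on one toggle, the warp-level read iterators on the
        // other).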
+ + if (warp_mma_k == Base::kWarpGemmIterations0 - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A0(tb_frag_A)); + + this->smem_iterator_B0_.store(transform_B0(tb_frag_B0)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B0_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A0_.add_tile_offset( + {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0}); + this->warp_tile_iterator_B0_.add_tile_offset( + {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + + this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + if (warp_mma_k == 0) { + + iterator_A.load(tb_frag_A); + iterator_B0.load(tb_frag_B0); + + ++iterator_A; + ++iterator_B0; + } + + warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], + warp_frag_B0[warp_mma_k % 2], accum0); + + } + } + + + //2nd Implicit Gemm + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile + FragmentIteratorA1 warp_tile_iterator_A1_(accum0); + + // + // Prologue + // + + FragmentB1 tb_frag_B1; + + tb_frag_B1.clear(); + + // The last kblock is loaded in the prolog + iterator_B1.load(tb_frag_B1); + + + ++iterator_B1; + + this->smem_iterator_B1_.store(transform_B1(tb_frag_B1)); + + ++this->smem_iterator_B1_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA1 warp_frag_A1[2]; + WarpFragmentB1 warp_frag_B1[2]; + + this->warp_tile_iterator_B1_.set_kgroup_index(0); + + warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0); + this->warp_tile_iterator_B1_.load(warp_frag_B1[0]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + Operator1 warp_mma1; + + smem_write_stage_idx = 1; + + int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_PRAGMA_UNROLL + for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. 
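        // Second-GEMM loop of the pipelined variant. Its trip count,
        // gemm_k_iterations_1, is a compile-time constant derived from
        // FragmentIteratorA1::Policy::kIterations, because the K extent of the
        // second GEMM is the N extent of the first GEMM's threadblock tile
        // held in registers; it does not depend on the convolution problem
        // size. Only the B1 operand cycles through shared memory here, so the
        // store/rewind bookkeeping below applies to smem_iterator_B1_ and
        // warp_tile_iterator_B1_ alone.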
+ + if (warp_mma_k == Base::kWarpGemmIterations1 - 1) { + + this->smem_iterator_B1_.store(transform_B1(tb_frag_B1)); + + __syncthreads(); + + ++this->smem_iterator_B1_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_B1_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * Base::kWarpGemmIterations1, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); + + warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0); + this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + if (warp_mma_k == 0) { + + iterator_B1.load(tb_frag_B1); + + ++iterator_B1; + } + + warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], + warp_frag_B1[warp_mma_k % 2], accum); + + } + } + + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h similarity index 99% rename from examples/13_fused_two_gemms/threadblock/b2b_mma_base.h rename to examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h index 01cca8b7a2..4293ec3dc9 100644 --- a/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h similarity index 95% rename from examples/13_fused_two_gemms/threadblock/b2b_mma_multistage.h rename to examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h index 8782b7af55..f09045a8b8 100644 --- a/examples/13_fused_two_gemms/threadblock/b2b_mma_multistage.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -635,40 +635,9 @@ class B2bMmaMultistage : ++stage, --gemm_k_iterations_1) { if (gemm_k_iterations_1 == 0) { -// iterator_A1.clear_mask(); iterator_B1.clear_mask(); } -#if 0 - iterator_A1.set_iteration_index(0); - this->smem_iterator_A1_.set_iteration_index(0); - - // LDGSTS for operand A - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < Detail::TBLDGSTSIterationsA1; ++j) { - typename IteratorA1::AccessType *dst_ptr = - reinterpret_cast( - this->smem_iterator_A1_.get()); - - CUTLASS_PRAGMA_UNROLL - for (int v = 0; v < IteratorA1::kAccessesPerVector; ++v) { - int const kSrcBytes = - sizeof_bits::value * - IteratorA1::ThreadMap::kElementsPerAccess / - IteratorA1::kAccessesPerVector / 8; - - int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0); - - cutlass::arch::cp_async_zfill( - dst_ptr + v, iterator_A0.get(), iterator_A0.valid()); - - ++iterator_A0; - } - - ++this->smem_iterator_A0_; - } -#endif - iterator_B1.set_iteration_index(0); this->smem_iterator_B1_.set_iteration_index(0); @@ -696,19 +665,14 @@ class B2bMmaMultistage : } // Move to the next stage - //iterator_A1.add_tile_offset({0, 1}); iterator_B1.add_tile_offset({1, 0}); - //this->smem_iterator_A1_.add_tile_offset({0, 1}); this->smem_iterator_B1_.add_tile_offset({1, 0}); // Defines the boundary of a stage of cp.async. cutlass::arch::cp_async_fence(); } - // Perform accumulation in the 'd' output operand -// FragmentC0 accum0 = src_accum; - // DEPBAR+SYNC cutlass::arch::cp_async_wait(); __syncthreads(); @@ -722,7 +686,6 @@ class B2bMmaMultistage : Operator1 warp_mma1; -// this->warp_tile_iterator_A1_.set_kgroup_index(0); this->warp_tile_iterator_B1_.set_kgroup_index(0); warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], output_op_0); @@ -732,7 +695,6 @@ class B2bMmaMultistage : ++this->warp_tile_iterator_B1_; if (gemm_k_iterations_1 == 0) { -// iterator_A1.clear_mask(); iterator_B1.clear_mask(); } @@ -762,7 +724,6 @@ class B2bMmaMultistage : // Load warp-level tiles from shared memory, wrapping to k offset if // this is the last group as the case may be. 
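        // The rewind of warp_tile_iterator_B1_ below must use
        // Policy1::kPartitionsK: this iterator advances through the second
        // GEMM's k-groups, so rewinding by the first GEMM's partition count
        // (Policy0) would leave it misaligned whenever the two MMA policies
        // differ.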
-// this->warp_tile_iterator_A1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], output_op_0); @@ -777,6 +738,7 @@ class B2bMmaMultistage : warp_loaded_frag_A1[warp_mma_k % 2], warp_loaded_frag_B1[warp_mma_k % 2]); + warp_mma1( accum, warp_transformed_frag_A1[warp_mma_k % 2], @@ -823,7 +785,7 @@ class B2bMmaMultistage : if (smem_read_stage_idx == (Base::kStages - 1)) { this->warp_tile_iterator_B1_.add_tile_offset( - {-Base::kStages * Policy0::kPartitionsK * + {-Base::kStages * Policy1::kPartitionsK * Base::kWarpGemmIterations1, 0}); smem_read_stage_idx = 0; @@ -831,7 +793,6 @@ class B2bMmaMultistage : ++smem_read_stage_idx; } -// --gemm_k_iterations_1; if (gemm_k_iterations_1 == 1) { iterator_B1.clear_mask(); } diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h similarity index 99% rename from examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h rename to examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h index 9887932a37..d6cc9922b5 100644 --- a/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -454,11 +454,11 @@ class B2bMmaPipelined : public B2bMmaBasesmem_iterator_B1_.store(tb_frag_B1); __syncthreads(); - ++smem_iterator_B1_; + ++this->smem_iterator_B1_; // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory if (smem_write_stage_idx == 1) { - smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); } else { this->warp_tile_iterator_B1_.add_tile_offset( diff --git a/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h similarity index 75% rename from examples/13_fused_two_gemms/threadblock/default_b2b_mma.h rename to examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h index b3621f56e6..5a95013159 100644 --- a/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h +++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -93,7 +93,7 @@ template < struct DefaultB2bMma; //////////////////////////////////////////////////////////////////////////////// -/// Specialization for row-major output +/// Specialization for row-major output with 2-stage pipeline template < /// Element type for A matrix operand typename ElementA, @@ -110,8 +110,6 @@ template < /// Element type for internal accumulation typename ElementAccumulator, /// Tag indicating architecture to tune for - typename OperatorClass, - /// Tag indicating architecture to tune for typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape0, @@ -129,7 +127,7 @@ template < typename EpilogueOutputOp> struct DefaultB2bMma { @@ -137,11 +135,11 @@ struct DefaultB2bMma; + arch::OpClassTensorOp, 2, Operator>; using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator, layout::RowMajor, - OperatorClass, 2, Operator>; + arch::OpClassTensorOp, 2, Operator>; // Define iterators over tiles from the A operand using IteratorA0 = @@ -162,7 +160,7 @@ struct DefaultB2bMma, //warp shape cutlass::MatrixShape, //accumulator shape MmaCore1::Shape::kK, //kBlocksColumn - ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp, true>; + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>; // Define iterators over tiles from the B operand using IteratorB1 = @@ -181,9 +179,120 @@ struct DefaultB2bMma; }; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output for multi-stage +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// Epilogue output operator + typename EpilogueOutputOp> +struct DefaultB2bMma { + + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * kAlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + + // Define the MmaCore components + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, Operator, false, CacheOpA, CacheOpB>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, Operator, false, CacheOpA, CacheOpB>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using AccessTypeA0 = cutlass::Array; + using IteratorA0 = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using AccessTypeB0 = cutlass::Array; + using IteratorB0 = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using AccessTypeB1 = cutlass::Array; + using IteratorB1 = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB1, AccessTypeB1>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistage< + typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA, + MmaCore0::kCacheOpA, + IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB, + typename MmaCore1::Shape, FragmentIteratorA1, + IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB, + ElementAccumulator, layout::RowMajor, + EpilogueOutputOp, + typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages>; + +}; + + //////////////////////////////////////////////////////////////////////////////// -/// Specialization for column-major-interleaved output +/// Specialization for column-major-interleaved output with 2-stage pipeline template < /// Element type for A matrix operand typename ElementA, @@ -258,7 +367,7 @@ struct DefaultB2bMma, //accumulator shape MmaCore1::Shape::kK, //kBlocksColumn ElementAccumulator, ElementA, AccumulatorLayout, - InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>; + InstructionShape, EpilogueOutputOp>; // Define iterators over tiles from the B operand using IteratorB1 = @@ -281,7 +390,7 @@ struct DefaultB2bMma, //accumulator shape MmaCore1::Shape::kK, //kBlocksColumn ElementAccumulator, ElementA, AccumulatorLayout, - InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>; + InstructionShape, EpilogueOutputOp>; // Define iterators over tiles from the B operand using ThreadMapB1 = typename 
MmaCore1::IteratorThreadMapB; diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt index 49e1a4f9e3..c8cad3ae72 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt +++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu index 84eadc5eab..58f5a87405 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -191,8 +191,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel @@ -258,7 +262,7 @@ int main() { } if (!((props.major * 10 + props.minor) >= 80)) { - std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80." + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; } diff --git a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt index 2d0929c3a8..ce786e653f 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt +++ b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu index 1b233c488b..c88a889b01 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu +++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
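The `can_implement()` call inserted above (and in the other examples touched by this change) lets the host reject unsupported problem shapes or misaligned operands before any workspace is allocated or kernels are launched. A hedged sketch of handling the returned status explicitly instead of aborting through `CUTLASS_CHECK`; the error-string helper is from `cutlass/cutlass.h`, and `gemm_op`, `arguments`, and `workspace` are assumed to be the objects defined earlier in the example:

```cpp
#include <iostream>
#include "cutlass/cutlass.h"

cutlass::Status status = gemm_op.can_implement(arguments);

if (status != cutlass::Status::kSuccess) {
  // Typical causes: operand pointers or leading dimensions that violate the kernel's
  // alignment requirements, or a problem shape the instantiated kernel cannot tile.
  std::cerr << "GEMM cannot be run with these arguments: "
            << cutlass::cutlassGetStatusString(status) << std::endl;
  return -1;
}

// Only after the check succeeds do we pay for workspace setup and the launch.
status = gemm_op.initialize(arguments, workspace.get());
```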
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -223,8 +223,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel diff --git a/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt similarity index 94% rename from examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt rename to examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt index 1b7daac3dc..42db35fa14 100644 --- a/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt +++ b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -22,7 +22,7 @@ cutlass_example_add_executable( - 22_ampere_tensorop_conv2dfprop + 16_ampere_tensorop_conv2dfprop ampere_tensorop_conv2dfprop.cu ) diff --git a/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu similarity index 97% rename from examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu rename to examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu index cb7c398661..4c417bc60b 100644 --- a/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu +++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -318,7 +318,7 @@ struct Options { /// Prints the usage statement. 
std::ostream & print_usage(std::ostream &out) const { - out << "22_ampere_tensorop_conv2dfprop example\n\n" + out << "16_ampere_tensorop_conv2dfprop example\n\n" << " This example uses Ampere's Tensor Core operators on F16 data types to compute\n" << " forward convolution on tensors of layout NHWC.\n\n" << "Options:\n\n" @@ -340,8 +340,8 @@ struct Options { << " --tag String to replicate across the first column in the results table\n"; out << "\n\nExamples:\n\n" - << "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" - << "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; + << "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" + << "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; return out; } @@ -474,8 +474,8 @@ Result profile_convolution(Options const &options) { // Split K dimension into 1 partitions int split_k_slices = 1; - typename ImplicitGemm::Arguments arguments{ - { + // Construct Conv2dProblemSize with user defined output size + cutlass::conv::Conv2dProblemSize problem_size( options.input_size, options.filter_size, options.padding, @@ -483,15 +483,18 @@ Result profile_convolution(Options const &options) { options.dilation, options.output_size(), mode, - split_k_slices - }, + split_k_slices + ); + + // Construct ImplicitGemm::Argument structure with conv2d + // problem size, data pointers, and epilogue values + typename ImplicitGemm::Arguments arguments{ + problem_size, tensor_a.device_ref(), tensor_b.device_ref(), tensor_c.device_ref(), tensor_c.device_ref(), {options.alpha, options.beta}, - - }; // @@ -505,6 +508,9 @@ Result profile_convolution(Options const &options) { // Allocate workspace memory cutlass::device_memory::allocation workspace(workspace_size); + result.status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(result.status); + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(result.status); @@ -522,15 +528,6 @@ Result profile_convolution(Options const &options) { if (options.reference_check) { std::cout << "Verification on host...\n"; - cutlass::conv::Conv2dProblemSize problem_size( - options.input_size, - options.filter_size, - options.padding, - options.conv_stride, - options.dilation, - mode - ); - // Compute with reference implementation cutlass::reference::host::Conv2dFprop< ElementInputA, @@ -576,7 +573,7 @@ Result profile_convolution(Options const &options) { std::stringstream ss; - ss << "22_ampere_workspace_conv2dfprop_" + ss << "16_ampere_workspace_conv2dfprop_" << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() << "_" << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() @@ -667,7 +664,7 @@ int main(int argc, char const **args) { bool notSupported = false; - // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0. // // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. 
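For context on the `Conv2dProblemSize` refactor above: a single problem-size object now feeds both the device kernel arguments and the host reference check, and it also fully determines the implicit-GEMM shape the kernel runs. A small sketch of querying that mapping; `implicit_gemm_problem_size` is the helper declared in `cutlass/conv/conv2d_problem_size.h`, and `problem_size` is assumed to be the object constructed earlier in this example:

```cpp
#include "cutlass/conv/conv2d_problem_size.h"

// For forward propagation (Fprop) the implicit GEMM extents are:
//   M = N * P * Q   (output pixels)
//   N = K           (output channels)
//   K = R * S * C   (filter taps times input channels)
cutlass::gemm::GemmCoord implicit_gemm_size =
    cutlass::conv::implicit_gemm_problem_size(cutlass::conv::Operator::kFprop, problem_size);
```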
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) { diff --git a/examples/13_fused_two_gemms/CMakeLists.txt b/examples/17_fprop_per_channel_bias/CMakeLists.txt similarity index 90% rename from examples/13_fused_two_gemms/CMakeLists.txt rename to examples/17_fprop_per_channel_bias/CMakeLists.txt index ba51537ca2..726f0d202d 100644 --- a/examples/13_fused_two_gemms/CMakeLists.txt +++ b/examples/17_fprop_per_channel_bias/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -20,14 +20,9 @@ # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cutlass_example_add_executable( - 13_fused_two_gemms - fused_gemm.cu - ) -target_include_directories( - 13_fused_two_gemms - PRIVATE - . +cutlass_example_add_executable( + 17_fprop_per_channel_bias + fprop_per_channel_bias.cu ) diff --git a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu new file mode 100644 index 0000000000..db504935ba --- /dev/null +++ b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu @@ -0,0 +1,300 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** +The convolution version of 12_gemm_bias_relu. Similarly, we put bias vector in Operand C and the +rest is the same as normal convolution. 
+*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/host_reorder.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes datatype for input, output tensors and computation between +// elements +using ElementAccumulator = float; // Data type of accumulator +using ElementComputeEpilogue = ElementAccumulator; // Data type of epilogue computation +using ElementInputA = cutlass::half_t; // Data type of elements in input tensor +using ElementInputB = cutlass::half_t; // Data type of elements in input tensor +using ElementOutput = float; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNHWC; +using LayoutInputB = cutlass::layout::TensorNHWC; +using LayoutOutput = cutlass::layout::TensorNHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm80; + +// This code section describes the tile size a thread block will compute +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; // Threadblock tile shape + +// This code section describes tile size a warp will compute +using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; // Warp tile shape + +// This code section describes the size of MMA op +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // TensorCore instruction shape + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// Number of pipelines you want to use +constexpr int NumStages = 4; + +// This code section describe iterator algorithm selected is Analytic or Optimized +static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized; + +// This code section describes the epilogue part of the kernel, we use default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, // Data type of output matrix. + 128 / cutlass::sizeof_bits::value, // The number of elements per vectorized. + // memory access. This becomes the vector width of + // math instructions in the epilogue too. 
+ ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue, // Data type for alpha in linear combination + cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // alpha X C + per channel bias + + +using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm +>::Kernel; + +using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int run() { + + // Construct Conv2dProblemSize with user defined output size + cutlass::conv::Conv2dProblemSize problem_size( + {1, 7, 7, 512}, // activation + {512, 3, 3, 512}, // filter + {1, 1, 1, 1}, // padding + {1, 1}, // striding + {1, 1}, // dilation + cutlass::conv::Mode::kCrossCorrelation, // mode (convolution or cross-correlation) + 1 // split-k slices + ); + + // Initialize tensors using CUTLASS helper functions + cutlass::HostTensor tensor_a(problem_size.activation_extent()); + cutlass::HostTensor tensor_b(problem_size.filter_extent()); + + // Create tensor C with dimensions 1x1x1xk which is the bias vector + cutlass::HostTensor tensor_c_bias({1, 1, 1, problem_size.K}); + + // Create tensor D used to store output from CUTLASS kernel + cutlass::HostTensor tensor_d(problem_size.output_extent()); + // Create matrix D with dimensions M x N used to store output from reference + // kernel + cutlass::HostTensor tensor_ref_d(problem_size.output_extent()); + + // Fill input and output matrices on host using CUTLASS helper functions + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(4), + ElementInputA(-4), + 0); // <- Fill tensor A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(4), + ElementInputB(-4), + 0); // <- Fill tensor B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c_bias.host_view(), + 1, + ElementOutput(4), + ElementOutput(-4), + 0); // <- Fill matrix C on host with uniform-distribution random data + cutlass::reference::host::TensorFill( + tensor_d.host_view()); // <- fill matrix D on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c_bias.sync_device(); + tensor_d.sync_device(); + tensor_ref_d.sync_device(); + + // Initialize alpha for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + + // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename ImplicitGemm::Arguments arguments{ + problem_size, + tensor_a.device_ref(), // <- reference to tensor A on device + tensor_b.device_ref(), // <- reference to tensor B on device + // tensor C is treated as the bias vector. We can enable the CONV + // to project away the N, H, W dimension by setting the stride to zero. 
+ {tensor_c_bias.device_data(), LayoutOutput::Stride(0)}, + tensor_d.device_ref(), // <- reference to tensor D on device + {alpha} }; + + // Instantiate CUTLASS kernel depending on templates + ImplicitGemm implicit_gemm_op; + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check the problem size is supported or not + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = implicit_gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = implicit_gemm_op(); + + CUTLASS_CHECK(status); + + // + // Create instantiation for device reference conv kernel + // + + // Launch device reference to compute strictly the product A * B + cutlass::reference::device::Conv2d< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementComputeEpilogue, + ElementAccumulator, + cutlass::NumericConverter> + ( + cutlass::conv::Operator::kFprop, + problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c_bias.device_ref(), + tensor_ref_d.device_ref(), + alpha, 0 + ); + + // Wait for kernels to finish + cudaDeviceSynchronize(); + + // Copy output data from CUTLASS and reference kernel to host for comparison + tensor_d.sync_host(); + tensor_ref_d.sync_host(); + + // Compute bias + relu in host code + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + for (int k = 0; k < problem_size.K; ++k) { + + tensor_ref_d.at({n, p, q, k}) = + std::max(ElementOutput(0), + ElementOutput(tensor_ref_d.at({n, p, q, k}) + + tensor_c_bias.at({0, 0, 0, k}))); + } + } + } + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), + tensor_ref_d.host_view()) + ? "Passed" + : "Failed") + << std::endl; + + CUTLASS_CHECK(status); + return 0; +} + +int main(int argc, char const **args) { + + bool notSupported = false; + + // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0. + // + // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." + << std::endl; + notSupported = true; + } + + if (notSupported) { + return 0; + } + + return run(); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d51df92c70..e5bfb78ca5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
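The `LayoutOutput::Stride(0)` trick above is what turns operand C into a per-channel bias: with every stride of the NHWC layout set to zero, all (n, p, q) coordinates read the same K-length vector, and `ScaleType::NoBetaScaling` drops the beta term entirely. A minimal host-side sketch of the epilogue math this produces (illustrative helper, not CUTLASS code):

```cpp
#include <algorithm>

// D[n, p, q, k] = max(0, alpha * Acc[n, p, q, k] + bias[k])   -- beta is never applied
float fused_bias_relu(float accumulator, float alpha, float bias_k) {
  return std::max(0.0f, alpha * accumulator + bias_k);
}
```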
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -78,10 +78,11 @@ foreach(EXAMPLE 10_planar_complex 11_planar_complex_array 12_gemm_bias_relu - 13_fused_two_gemms + 13_two_tensor_op_fusion 14_ampere_tf32_tensorop_gemm 15_ampere_sparse_tensorop_gemm - 22_ampere_tensorop_conv2dfprop + 16_ampere_tensorop_conv2dfprop + 17_fprop_per_channel_bias ) add_subdirectory(${EXAMPLE}) diff --git a/include/cutlass/aligned_buffer.h b/include/cutlass/aligned_buffer.h index 8b3bb0713d..75163cae50 100644 --- a/include/cutlass/aligned_buffer.h +++ b/include/cutlass/aligned_buffer.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h index eb0a2ad43b..14b5c9d22a 100644 --- a/include/cutlass/arch/arch.h +++ b/include/cutlass/arch/arch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/cache_operation.h b/include/cutlass/arch/cache_operation.h index 646b51ded3..d6435fa9bf 100644 --- a/include/cutlass/arch/cache_operation.h +++ b/include/cutlass/arch/cache_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h index d9f386eec7..4abaf0d858 100644 --- a/include/cutlass/arch/memory.h +++ b/include/cutlass/arch/memory.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -187,10 +187,10 @@ struct global_load struct global_store; @@ -294,7 +294,6 @@ struct global_store { ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace arch } // namespace cutlass diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h index 3fd121b903..a541bc9ddd 100644 --- a/include/cutlass/arch/memory_sm75.h +++ b/include/cutlass/arch/memory_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ inline __device__ void ldsm(Array & D, void const* ptr); #endif */ -#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) +#if (! defined (__clang__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) extern "C" { // // This NVVM intrinsic is subject to change in future versions of CUDA. @@ -91,7 +91,7 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { // We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to // the previous internal intrinsics if they are available. -#if (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11) +#if (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11) // // This NVVM intrinsic converts an address in shared memory to a plain // unsigned integer. This is necessary to pass to shared memory instructions @@ -104,7 +104,7 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { /// CUTLASS helper to get SMEM pointer return static_cast(__cvta_generic_to_shared(ptr)); -#elif (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) +#elif (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) return __nvvm_get_smem_pointer(ptr); @@ -120,7 +120,10 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { #else - return 0; + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); + return 0; + #endif } @@ -146,7 +149,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -168,7 +173,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -190,7 +197,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -216,7 +225,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -238,7 +249,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -260,7 +273,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h index 045196cb8f..1b5bb10bb7 100644 --- a/include/cutlass/arch/memory_sm80.h +++ b/include/cutlass/arch/memory_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -74,15 +74,16 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async { - // Make sure the size is supported. 
- static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), - "Size is not supported"); /// Copy CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { #if CUDA_CP_ASYNC_ACTIVATED - + + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); asm volatile( @@ -108,15 +109,16 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async_zfill { - // Make sure the size is supported. - static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), - "Size is not supported"); /// Copy with zero fill CUTLASS_DEVICE cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) { #if CUDA_CP_ASYNC_ACTIVATED - + + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); int src_in_bytes = (pred_guard ? SizeInBytes : 0); @@ -146,16 +148,13 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async { - // Make sure the size is supported. - static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), - "Size is not supported"); /// Copy CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { #if CUDA_CP_ASYNC_ACTIVATED - static_assert(SizeInBytes == 16, + static_assert(SizeInBytes == 16, "cp.async only supports CacheOperation::Global when access size is 16B."); unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); @@ -183,16 +182,13 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async_zfill { - // Make sure the size is supported. - static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), - "Size is not supported"); /// Copy with zero fill CUTLASS_DEVICE cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { #if CUDA_CP_ASYNC_ACTIVATED - static_assert(SizeInBytes == 16, + static_assert(SizeInBytes == 16, "cp.async only supports CacheOperation::Global when access size is 16B."); unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index 729cd17917..1672e60713 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm50.h b/include/cutlass/arch/mma_sm50.h index cc4a94b17e..fa8e1949ec 100644 --- a/include/cutlass/arch/mma_sm50.h +++ b/include/cutlass/arch/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
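Relatedly, the `static_assert`s in `cp_async` / `cp_async_zfill` above were moved inside the `CUDA_CP_ASYNC_ACTIVATED` guard so they only fire on toolchains that actually compile the asynchronous-copy path. A hedged usage sketch of these primitives staging one 16-byte fragment per thread from global to shared memory; this is a device-code fragment that assumes an SM80 target, CUDA 11, and a hypothetical global pointer `gmem`:

```cpp
#include "cutlass/array.h"
#include "cutlass/half.h"
#include "cutlass/arch/memory_sm80.h"

using Fragment = cutlass::Array<cutlass::half_t, 8>;   // 8 x 16-bit elements = 16 bytes

__global__ void stage_tile(Fragment const *gmem) {
  __shared__ Fragment smem[128];

  // Issue the asynchronous copy; with CacheOperation::Global a 16B access maps to cp.async.cg.
  cutlass::arch::cp_async<sizeof(Fragment), cutlass::arch::CacheOperation::Global>(
      &smem[threadIdx.x], gmem + threadIdx.x, /*pred_guard=*/true);

  // Commit the copies issued so far and wait for them before reading shared memory.
  cutlass::arch::cp_async_fence();
  cutlass::arch::cp_async_wait<0>();
  __syncthreads();
}
```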
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h index 5c82f74ec3..1b18609690 100644 --- a/include/cutlass/arch/mma_sm60.h +++ b/include/cutlass/arch/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h index 6cbe260633..5ee65c2574 100644 --- a/include/cutlass/arch/mma_sm61.h +++ b/include/cutlass/arch/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm70.h b/include/cutlass/arch/mma_sm70.h index b03ce2c1de..213d6bb54e 100644 --- a/include/cutlass/arch/mma_sm70.h +++ b/include/cutlass/arch/mma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index c5e0db9720..62015d3dd7 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h index 289c205cad..c4fdaedf5d 100644 --- a/include/cutlass/arch/mma_sm80.h +++ b/include/cutlass/arch/mma_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -112,7 +112,13 @@ struct Mma< ); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -178,7 +184,13 @@ struct Mma< ); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -230,7 +242,13 @@ struct Mma, 32, tfloat32_t, layout::RowMajor, "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -291,7 +309,13 @@ struct Mma< ); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -352,7 +376,13 @@ struct Mma< "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -413,7 +443,13 @@ struct Mma< "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -472,7 +508,13 @@ struct Mma< : "d"(A), "d"(B), "d"(C[0]), "d"(C[1])); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; diff --git a/include/cutlass/arch/mma_sparse_sm80.h b/include/cutlass/arch/mma_sparse_sm80.h index a93fd2924c..8d3aaaf0a6 100644 --- a/include/cutlass/arch/mma_sparse_sm80.h +++ b/include/cutlass/arch/mma_sparse_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd.h b/include/cutlass/arch/simd.h index 2503094ad3..4e7265c403 100644 --- a/include/cutlass/arch/simd.h +++ b/include/cutlass/arch/simd.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd_sm60.h b/include/cutlass/arch/simd_sm60.h index 36030a3661..277cf1af36 100644 --- a/include/cutlass/arch/simd_sm60.h +++ b/include/cutlass/arch/simd_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
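The `assert(0)` replacements in `mma_sm80.h` above follow a consistent pattern: when the target architecture does not provide the instruction, every operand is marked as used and the path fails through `CUTLASS_NOT_IMPLEMENTED()`, which also keeps unused-parameter warnings quiet on builds that never reach the PTX. A rough, hypothetical skeleton of that shape (the real specializations are the ones in this diff):

```cpp
#include "cutlass/cutlass.h"
#include "cutlass/arch/mma_sm80.h"

CUTLASS_DEVICE
void mma_or_trap(float (&d)[4], unsigned const (&a)[4], unsigned const (&b)[2], float const (&c)[4]) {
#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
  // ... inline PTX for the mma.sync instruction would go here ...
#else
  CUTLASS_UNUSED(d);
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_NOT_IMPLEMENTED();
#endif
}
```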
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd_sm61.h b/include/cutlass/arch/simd_sm61.h index 94f1c617c3..3f7b2d8ae3 100644 --- a/include/cutlass/arch/simd_sm61.h +++ b/include/cutlass/arch/simd_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h index 0a556aee3a..fa6d288a61 100644 --- a/include/cutlass/arch/wmma.h +++ b/include/cutlass/arch/wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm70.h b/include/cutlass/arch/wmma_sm70.h index 94eeb93deb..55af75a4ae 100644 --- a/include/cutlass/arch/wmma_sm70.h +++ b/include/cutlass/arch/wmma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm72.h b/include/cutlass/arch/wmma_sm72.h index 1b8cc1161e..9e79d16ad4 100644 --- a/include/cutlass/arch/wmma_sm72.h +++ b/include/cutlass/arch/wmma_sm72.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm75.h b/include/cutlass/arch/wmma_sm75.h index f630712fc6..e0d15bf4a7 100644 --- a/include/cutlass/arch/wmma_sm75.h +++ b/include/cutlass/arch/wmma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/array.h b/include/cutlass/array.h index 3faa11d022..4eee99602b 100644 --- a/include/cutlass/array.h +++ b/include/cutlass/array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/array_planar_complex.h b/include/cutlass/array_planar_complex.h index e2dbbc47cb..0d9a94a987 100644 --- a/include/cutlass/array_planar_complex.h +++ b/include/cutlass/array_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h index 78081facc7..81008df727 100644 --- a/include/cutlass/array_subbyte.h +++ b/include/cutlass/array_subbyte.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,9 +45,6 @@ template < class Array { public: - static_assert(sizeof_bits::value * N >= 8, - "Array<> specialized for sub-byte types assume the actual stored element size is 1 byte"); - static int const kSizeBits = sizeof_bits::value * N; /// Storage type diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h index 3a4b8bd76e..fc32a509ad 100644 --- a/include/cutlass/bfloat16.h +++ b/include/cutlass/bfloat16.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h index 7c0ab3b4f3..3312619cbb 100644 --- a/include/cutlass/complex.h +++ b/include/cutlass/complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -52,6 +52,23 @@ enum class ComplexTransform { kConjugate }; +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines ComplexTransform inversions +template +struct InvertComplexTransform; + +/// Invert ComplexTransform from kNone to kConjugate +template <> +struct InvertComplexTransform { + static ComplexTransform const transform = ComplexTransform::kConjugate; +}; + +/// Invert ComplexTransform from kConjugate to kNone +template <> +struct InvertComplexTransform { + static ComplexTransform const transform = ComplexTransform::kNone; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// // @@ -291,6 +308,30 @@ CUTLASS_HOST_DEVICE T &imag(complex &z) { return z.imag(); } +/// Returns the real part of the real number +template +CUTLASS_HOST_DEVICE T const &real(T const &r) { + return r; +} + +/// Returns the real part of the real number +template +CUTLASS_HOST_DEVICE T &real(T &r) { + return r; +} + +/// Returns the imaginary part of the real number +template +CUTLASS_HOST_DEVICE T const &imag(T const &r) { + return T(); +} + +/// Returns the imaginary part of the complex number +template +CUTLASS_HOST_DEVICE T &imag(T &r) { + return T(); +} + // // Output operators // diff --git a/include/cutlass/constants.h b/include/cutlass/constants.h index 690891b227..9666b2b9d5 100644 --- a/include/cutlass/constants.h +++ b/include/cutlass/constants.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h index 735103722d..fd87e1acdd 100644 --- a/include/cutlass/conv/conv2d_problem_size.h +++ b/include/cutlass/conv/conv2d_problem_size.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/conv3d_problem_size.h b/include/cutlass/conv/conv3d_problem_size.h index 91827d2724..495fcc3bf2 100644 --- a/include/cutlass/conv/conv3d_problem_size.h +++ b/include/cutlass/conv/conv3d_problem_size.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
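The `InvertComplexTransform` trait and the scalar `real()` / `imag()` overloads added to `complex.h` above make it easier to write code that is generic over real and complex element types. A small compile-time usage sketch, assuming this change is applied:

```cpp
#include "cutlass/complex.h"

static_assert(
    cutlass::InvertComplexTransform<cutlass::ComplexTransform::kNone>::transform ==
        cutlass::ComplexTransform::kConjugate,
    "kNone inverts to kConjugate");

static_assert(
    cutlass::InvertComplexTransform<cutlass::ComplexTransform::kConjugate>::transform ==
        cutlass::ComplexTransform::kNone,
    "kConjugate inverts to kNone");

// The new scalar overloads let generic code call real() on plain floats as well as on
// cutlass::complex<float>:
inline float real_part_of(float x) { return cutlass::real(x); }   // returns x itself
```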
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -188,7 +188,7 @@ struct Conv3dProblemSize : public Conv2dProblemSize { mode, split_k_slices, groups ) { // set output Z - Z = ((D + pad_d - T * dilation_d) / stride_d) + 1; + Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1; } /// Equality operator (ignores mode and split_k_slice) diff --git a/include/cutlass/conv/convolution.h b/include/cutlass/conv/convolution.h index c743ea6faa..95afe94f57 100644 --- a/include/cutlass/conv/convolution.h +++ b/include/cutlass/conv/convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/device/implicit_gemm_convolution.h b/include/cutlass/conv/device/implicit_gemm_convolution.h index 0aa03d1997..2e5e3b0c82 100644 --- a/include/cutlass/conv/device/implicit_gemm_convolution.h +++ b/include/cutlass/conv/device/implicit_gemm_convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/kernel/default_conv2d.h b/include/cutlass/conv/kernel/default_conv2d.h index 57fae79655..603856a4f1 100644 --- a/include/cutlass/conv/kernel/default_conv2d.h +++ b/include/cutlass/conv/kernel/default_conv2d.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -41,7 +41,6 @@ #include "cutlass/conv/threadblock/implicit_gemm_pipelined.h" #include "cutlass/conv/threadblock/implicit_gemm_multistage.h" #include "cutlass/conv/kernel/implicit_gemm_convolution.h" - ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -101,4 +100,3 @@ struct DefaultConvEpilogue< } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/include/cutlass/conv/kernel/default_conv2d_dgrad.h index c590f57efc..f81c389728 100644 --- a/include/cutlass/conv/kernel/default_conv2d_dgrad.h +++ b/include/cutlass/conv/kernel/default_conv2d_dgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
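The `Conv3dProblemSize` change above fixes the output-depth computation so that padding is counted on both sides of the input depth. A quick worked check of the corrected formula, using unit dilation and stride for readability:

```cpp
#include <cassert>

int output_depth(int D, int pad_d, int T, int dilation_d, int stride_d) {
  // Corrected formula: padding contributes on both sides of the input depth.
  return ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1;
}

int main() {
  // A 'same'-padded 3-tap filter (pad_d = 1, dilation = 1, stride = 1) preserves depth:
  assert(output_depth(8, 1, 3, 1, 1) == 8);   // the old single-sided formula gave 7
  return 0;
}
```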
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop.h b/include/cutlass/conv/kernel/default_conv2d_fprop.h index c38d5150b1..d22fb7f0ba 100644 --- a/include/cutlass/conv/kernel/default_conv2d_fprop.h +++ b/include/cutlass/conv/kernel/default_conv2d_fprop.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -1376,4 +1376,3 @@ struct DefaultConv2dFprop < } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/include/cutlass/conv/kernel/default_conv2d_wgrad.h index c7912203a4..1bb68689d0 100644 --- a/include/cutlass/conv/kernel/default_conv2d_wgrad.h +++ b/include/cutlass/conv/kernel/default_conv2d_wgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/include/cutlass/conv/kernel/default_conv3d_dgrad.h index a92b4bfb6a..475cceecc6 100644 --- a/include/cutlass/conv/kernel/default_conv3d_dgrad.h +++ b/include/cutlass/conv/kernel/default_conv3d_dgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -34,6 +34,9 @@ #include "cutlass/cutlass.h" #include "cutlass/conv/kernel/default_conv2d.h" +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h" + #include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h" #include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h" #include "cutlass/conv/threadblock/conv2d_tile_iterator.h" @@ -45,7 +48,7 @@ namespace conv { namespace kernel { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Defines a kernel for Conv2dDgrad +/// Defines a kernel for Conv3dDgrad template < typename ElementA, typename LayoutA, @@ -67,7 +70,7 @@ template < conv::StrideSupport StrideSupport = StrideSupport::kStrided > struct DefaultConv3dDgrad; -/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided +/// Defines a kernel for Conv3dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided // and multistage pipeline. 
template < typename ElementA, @@ -174,6 +177,117 @@ struct DefaultConv3dDgrad < }; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dDgrad specialzation for Optimized IteratorAlgorithm Dgrad Strided +// and multistage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernel diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop.h b/include/cutlass/conv/kernel/default_conv3d_fprop.h index 7694c8b9e8..5660458855 100644 --- a/include/cutlass/conv/kernel/default_conv3d_fprop.h +++ b/include/cutlass/conv/kernel/default_conv3d_fprop.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -34,6 +34,10 @@ #include "cutlass/cutlass.h" #include "cutlass/conv/kernel/default_conv2d.h" +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h" + + #include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h" #include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" @@ -68,6 +72,113 @@ template < ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop specialization for Analytic Iterator Algorithm +/// and 2 stage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage // pipeline. 
template < @@ -173,6 +284,114 @@ struct DefaultConv3dFprop < ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop specialzation for Optimized IteratorAlgorithm and multistage +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace kernel } // namespace conv } // namespace cutlass diff --git a/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/include/cutlass/conv/kernel/default_conv3d_wgrad.h index b0f5b91558..2f7ea86807 100644 --- a/include/cutlass/conv/kernel/default_conv3d_wgrad.h +++ b/include/cutlass/conv/kernel/default_conv3d_wgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h index 2ec1566889..fbc44b15b0 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -216,8 +216,7 @@ struct ImplicitGemmConvolution { ): problem_size(args.problem_size), implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), - grid_tiled_shape(grid_tiled_shape), - iterator_A(args.problem_size, args.ref_A.layout()), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), ptr_A(args.ref_A.data()), iterator_B(args.problem_size, args.ref_B.layout()), ptr_B(args.ref_B.data()), diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h index 14c8a4e829..8afb4968b1 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h index f76dcde931..937216d5e6 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h index d32da7c3bf..e33e4ccb23 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -186,6 +186,11 @@ class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { @@ -402,6 +407,11 @@ class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h index 71299cf578..078c9e7fc1 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -98,35 +98,7 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized { // Parameters structure // - struct Params : Conv2dDgradOutputGradientIteratorOptimizedParams { - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params(Conv2dDgradOutputGradientIteratorOptimizedParams const &base): - Conv2dDgradOutputGradientIteratorOptimizedParams(base) { } - - CUTLASS_HOST_DEVICE - Params( - Conv2dProblemSize const &problem_size, - Layout const &layout - ): - Conv2dDgradOutputGradientIteratorOptimizedParams( - problem_size, - layout, - sizeof_bits::value, - {Shape::kRow, Shape::kColumn}, - ThreadMap::kThreads, - ThreadMap::kElementsPerAccess, - {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, - {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} - ) { } - }; + using Params = Conv2dDgradOutputGradientIteratorOptimizedParams; private: @@ -239,10 +211,22 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + private: /// Returns the coordinate in the output gradient tensor dy that is correspoinding to - // output nhw and filter position k, r, s + // activation nhw and filter position k, r, s CUTLASS_HOST_DEVICE TensorCoord at_(int n, int h, int w, int r, int s) const { diff --git 
a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h index 92dd705d6b..51a5150456 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -146,6 +146,11 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h index afb015d352..573255da32 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -95,33 +95,7 @@ class Conv2dFpropActivationTileAccessIteratorOptimized { // Parameters structure // - struct Params : Conv2dFpropActivationIteratorOptimizedParams { - - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params(Conv2dFpropActivationIteratorOptimizedParams const &base): - Conv2dFpropActivationIteratorOptimizedParams(base) { } - - CUTLASS_HOST_DEVICE - Params( - Conv2dProblemSize const &problem_size, - Layout const &layout - ): - Conv2dFpropActivationIteratorOptimizedParams( - problem_size, - layout, - sizeof_bits::value, - {Shape::kRow, Shape::kColumn}, - ThreadMap::kThreads, - ThreadMap::kElementsPerAccess, - {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, - {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} - ) { - - } - }; + using Params = Conv2dFpropActivationIteratorOptimizedParams; private: @@ -234,6 +208,18 @@ class Conv2dFpropActivationTileAccessIteratorOptimized { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + private: /// Returns the coordinate in the activations tensor X that is correspoinding to diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h index 6547e9c5ba..b0a89adae2 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h index bf0d1d3124..2f12e41fef 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_params.h b/include/cutlass/conv/threadblock/conv2d_params.h index ac6b2e3095..3c64b1f75e 100644 --- a/include/cutlass/conv/threadblock/conv2d_params.h +++ b/include/cutlass/conv/threadblock/conv2d_params.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -68,7 +68,7 @@ struct Conv2dAnalyticParams { CUTLASS_HOST_DEVICE Conv2dAnalyticParams( - Conv2dProblemSize const &problem_size, + Conv2dProblemSize const &, // unused; placeholder to match other Params interfaces. Layout const &layout ): layout(layout) { @@ -168,7 +168,10 @@ struct Conv2dFpropActivationIteratorOptimizedParams { layout::PitchLinearCoord threadmap_iterations, layout::PitchLinearCoord threadmap_delta ): - layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) { + layout(layout), + PQ(problem_size.P * problem_size.Q), + pq_divmod(PQ), + q_divmod(problem_size.Q) { TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); @@ -176,7 +179,9 @@ struct Conv2dFpropActivationIteratorOptimizedParams { int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1); // next S - inc_next[0] = conv_sign * (int64_t(layout.stride()[0]) * problem_size.dilation_w) * element_size_bits / 8; + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; // next R inc_next[1] = conv_sign * ( @@ -388,7 +393,7 @@ struct Conv2dDgradOutputGradientIteratorOptimizedParams { int filter_k_delta; // number of logical elements to add to filter_k_ - int HW; // product of H*W + int HW; // product of H*W FastDivmod hw_divmod; FastDivmod w_divmod; @@ -411,7 +416,10 @@ struct Conv2dDgradOutputGradientIteratorOptimizedParams { layout::PitchLinearCoord threadmap_iterations, layout::PitchLinearCoord threadmap_delta ): - layout(layout), HW(problem_size.H *problem_size.W), hw_divmod(HW), w_divmod(problem_size.W) { + layout(layout), + HW(problem_size.H *problem_size.W), + hw_divmod(HW), + w_divmod(problem_size.W) { TRACE_CONV_INITIALIZERS("conv2d_dgrad", "output_gradient", element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); @@ -419,7 +427,9 @@ struct Conv2dDgradOutputGradientIteratorOptimizedParams { int conv_sign = (problem_size.mode == Mode::kConvolution ? 
1 : -1); // next S - inc_next[0] = conv_sign * (layout.stride()[0] * problem_size.dilation_w) * element_size_bits / 8; + inc_next[0] = conv_sign * ( + layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; // next R inc_next[1] = conv_sign * ( diff --git a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h index ce52017e37..61f02d19fe 100644 --- a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h +++ b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -92,6 +92,12 @@ class TileIterator { ): tile_access_iterator_(params, problem_size, ptr, thread_idx, threadblock_offset) { } + CUTLASS_HOST_DEVICE + static Params getParams(ConvProblemSize const &problem_size, Layout const &layout) { + return TileAccessIterator::getParams(problem_size, layout); + } + + /// Adds a pointer offset in units of Element CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset) { diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h index 13d8338c2f..1e3a5837d0 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h index 74a887794b..7762d6191f 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h index 84c788d6d4..53fc920575 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,6 +133,11 @@ class Conv2dWgradOutputGradientTileAccessIteratorAnalytic { } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h index 4a20cb1d8b..f138ef59a4 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -86,35 +86,7 @@ class Conv2dWgradOutputGradientTileAccessIteratorOptimized { // Parameters structure // - struct Params : Conv2dWgradOutputGradientIteratorOptimizedParams { - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params(Conv2dWgradOutputGradientIteratorOptimizedParams const &base): - Conv2dWgradOutputGradientIteratorOptimizedParams(base) { } - - CUTLASS_HOST_DEVICE - Params( - Conv2dProblemSize const &problem_size, - Layout const &layout - ): - Conv2dWgradOutputGradientIteratorOptimizedParams( - problem_size, - layout, - sizeof_bits::value, - {Shape::kRow, Shape::kColumn}, - ThreadMap::kThreads, - ThreadMap::kElementsPerAccess, - {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, - {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} - ) { } - }; + using Params = Conv2dWgradOutputGradientIteratorOptimizedParams; private: @@ -176,6 +148,18 @@ class Conv2dWgradOutputGradientTileAccessIteratorOptimized { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h index 0033568278..01437547c6 100644 --- a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..ee532ff61e --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h @@ -0,0 +1,283 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv3dDgradFilterTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = StrideSupport_; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + // + // Parameters structure + // + + struct Params : Conv3dDgradFilterIteratorOptimizedParams { + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv3dDgradFilterIteratorOptimizedParams const &base): + Conv3dDgradFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): + Conv3dDgradFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + 
ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + + }; + +private: + + Conv3dDgradFilterIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_trs_; + int filter_k_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided * + ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorOptimized( + Conv3dDgradFilterIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_trs_(0), + filter_k_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.strided(); + Index column = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided; + int filter_c = column + c * ThreadMap::Delta::kContiguous; + + uint32_t pred = ((filter_k < problem_size_.K && filter_c < problem_size_.C) ? 
1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + pointer_ += ( + filter_k_ * params.layout.stride()[3] + column + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_trs; + + // moves to the next tile + ++filter_trs_; + if (filter_trs_ == params_.TRS) { + + filter_trs_ = 0; + next = params_.inc_next_k; + filter_k_ += params_.filter_k_delta; + } + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_strided; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h index 47e7de46a0..1d70ab3d57 100644 --- a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -212,6 +212,11 @@ class Conv3dDgradOutputGradientTileAccessIteratorAnalytic < } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..2a62c2924b --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,484 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv3dDgradOutputGradientTileAccessIteratorOptimized { +public: + + static_assert(StrideSupport_ == conv::StrideSupport::kUnity, + "Only unit-stride dgrad is supported at this time."); + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + using Coord3D = Coord<3>; + + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv3dDgradOutputGradientIteratorOptimizedParams; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (t, r, s) + int filter_t_; + int filter_r_; + int filter_s_; + int filter_k_; + + Index masks_[ThreadMap::Iterations::kStrided][3]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorOptimized( + Params const ¶ms, + 
ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + filter_k_(0), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_d[ThreadMap::Iterations::kStrided]; + int offset_h[ThreadMap::Iterations::kStrided]; + int offset_w[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W); + // int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W); + // + // + // offset_d[s] = residual / (problem_size_.H * problem_size_.W); + // residual = residual % (problem_size_.H * problem_size_.W); + // + // offset_h[s] = residual / problem_size_.W; + // offset_w[s] = residual % problem_size_.W; + // + + int residual; + + // input: (ndhw offset) output: (n offset and resudial (dhw offset)) + params_.dhw_divmod(offset_n[s], residual, offset_ndhw); + // input: (dhw offset) output: (d offset and resudial (hw)) + params_.hw_divmod(offset_d[s], residual, residual); + // input: (hw offset) output: (h offset and resudial (w offset)) + params_.w_divmod(offset_h[s], offset_w[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_d[s], offset_h[s], offset_w[s], 0, 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int t = 0; t < problem_size_.T; ++t) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int t_ = t; + if (problem_size_.mode == Mode::kConvolution) { + t_ = problem_size_.T - 1 - t; + } + + int z = offset_d[s_idx] + problem_size_.pad_d - t_ * problem_size_.dilation_d; + + bool pred = (offset_n[s_idx] < problem_size_.N && z >= 0 && z < problem_size_.Z); + masks_[s_idx][0] |= (pred << t); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h; + + bool pred = (p >= 0 && p < problem_size_.P); + masks_[s_idx][1] |= (pred << r); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w; + + bool pred = (q >= 0 && q < problem_size_.Q); + masks_[s_idx][2] |= (pred << s); + } + } + + if (filter_k_ >= problem_size.K) { + clear_mask(); + } + + set_iteration_index(0); + + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const 
&problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + +private: + + + /// Returns the coordinate in the output gradient tensor dy that is correspoinding to + // activation ndhw and filter position k, t, r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int d, int h, int w, int t, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + t = problem_size_.T - 1 - t; + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int z = d + problem_size_.pad_d - t * problem_size_.dilation_d; + int p = h + problem_size_.pad_h - r * problem_size_.dilation_h; + int q = w + problem_size_.pad_w - s * problem_size_.dilation_w; + + return TensorCoord(n, z, p, q, filter_k_); + } + + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. + #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][2]) + : + "r"((int)clear), + "r"(masks_[s][2]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + masks_[s][2] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + + filter_s_ = 0; + ++filter_r_; + next_idx = 1; + + if (filter_r_ == problem_size_.R) { + filter_r_ = 0; + ++filter_t_; + + if (filter_t_ < problem_size_.T) { + next_idx = 2; + } + else { + filter_t_ = 0; + next_idx = 3; + } + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 3) { + filter_k_ += params_.filter_k_delta; + } + + clear_mask_(filter_k_ >= problem_size_.K); + + } + + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + 
CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + masks_[s][2] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][2] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // This is specialized for unit stride + if (problem_size.stride() != Coord3D({1, 1, 1})) { + return Status::kErrorNotSupported; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorNotSupported; + } + + // Limit on filter size + if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h index f5d14b5b10..7cadf860f7 100644 --- a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -46,6 +46,7 @@ #include "cutlass/layout/matrix.h" #include "cutlass/conv/convolution.h" #include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -91,25 +92,7 @@ class Conv3dFpropActivationTileAccessIteratorAnalytic { // Parameters structure // - struct Params { - - Layout layout; - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params( - ConvProblemSize const &problem_size, - Layout const &layout - ): layout(layout) { - - } - }; + using Params = Conv3dAnalyticParams; private: @@ -168,6 +151,11 @@ class Conv3dFpropActivationTileAccessIteratorAnalytic { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..9246c59221 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h @@ -0,0 +1,472 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. 
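With the per-iterator Params structs replaced by the shared Conv3dAnalyticParams and a static getParams() hook, callers can construct iterator parameters without knowing each iterator's internals. The mock-up below only illustrates that pattern; every type and value in it is invented for the example and is not part of the patch.

#include <cstdio>

// Toy stand-ins for Conv3dProblemSize, the layout object, and an iterator
// that aliases a shared params type and exposes a static getParams() factory.
struct ToyProblemSize { int N, D, H, W, C; };
struct ToyLayout { long long stride[4]; };

struct ToySharedParams {
  ToyLayout layout;
  ToySharedParams() = default;
  ToySharedParams(ToyProblemSize const &, ToyLayout const &layout_) : layout(layout_) {}
};

struct ToyIterator {
  using Params = ToySharedParams;   // shared params type, as in Conv3dAnalyticParams

  static Params getParams(ToyProblemSize const &ps, ToyLayout const &layout) {
    return Params(ps, layout);
  }
};

int main() {
  ToyProblemSize ps{1, 8, 8, 8, 32};
  ToyLayout layout{{32, 32 * 8, 32 * 8 * 8, 32 * 8 * 8 * 8}};

  ToyIterator::Params params = ToyIterator::getParams(ps, layout);
  std::printf("stride[0] = %lld\n", params.layout.stride[0]);
  return 0;
}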
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv3dFpropActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv3dFpropActivationIteratorOptimizedParams; + +private: + + Conv3dFpropActivationIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (t, r, s) + int filter_t_; + int filter_r_; + int filter_s_; + int filter_c_; + + // mask for t, r, and s + Index masks_[ThreadMap::Iterations::kStrided][3]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorOptimized( + Conv3dFpropActivationIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ) : + params_(params), + problem_size_(problem_size), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_z[ThreadMap::Iterations::kStrided]; + int offset_p[ThreadMap::Iterations::kStrided]; + int offset_q[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are 
equivalent to the following logical computation: + // + // + // offset_n[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // offset_z[s] = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // offset_p[s] = residual / problem_size_.Q; + // offset_q[s] = residual % problem_size_.Q; + // + + int residual; + + // input: (nzpq offset) output: (n offset and resudial (zpq offset)) + params.zpq_divmod(offset_n[s], residual, offset_nzpq); + // input: (zpq offset) output: (z offset and resudial (pq)) + params.pq_divmod(offset_z[s], residual, residual); + // input: (pq offset) output: (p offset and resudial (q offset)) + params.q_divmod(offset_p[s], offset_q[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_z[s], offset_p[s], offset_q[s], 0, 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + // mask predicates for filter position T + CUTLASS_PRAGMA_NO_UNROLL + for (int t = 0; t < problem_size_.T; ++t) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int t_ = t; + if (problem_size_.mode == Mode::kConvolution) { + t_ = problem_size_.T - 1 - t; + } + + int d = offset_z[s_idx] * problem_size_.stride_d - problem_size_.pad_d + t_ * problem_size_.dilation_d; + + bool pred = (offset_n[s_idx] < problem_size_.N && d >= 0 && d < problem_size_.D); + masks_[s_idx][0] |= (pred << t); + } + } + + // mask predicates for filter position R + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h; + + bool pred = (h >= 0 && h < problem_size_.H); + masks_[s_idx][1] |= (pred << r); + } + } + + // mask predicates for filter position S + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w; + + bool pred = (w >= 0 && w < problem_size_.W); + masks_[s_idx][2] |= (pred << s); + } + } + + if (filter_c_ >= problem_size.C) { + clear_mask(); + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + +private: + + /// Returns the coordinate in the activations tensor X that is correspoinding to + // output nzpq and filter position t, r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int z, int p, int q, int t, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + t = problem_size_.T - 1 - t; + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int d = z * problem_size_.stride_d 
- problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_); + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. + #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][2]) + : + "r"((int)clear), + "r"(masks_[s][2]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + masks_[s][2] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + + filter_s_ = 0; + ++filter_r_; + next_idx = 1; + + if (filter_r_ == problem_size_.R) { + filter_r_ = 0; + ++filter_t_; + + if (filter_t_ < problem_size_.T) { + next_idx = 2; + } + else { + filter_t_ = 0; + next_idx = 3; + } + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 3) { + filter_c_ += params_.filter_c_delta; + } + + clear_mask_(filter_c_ >= problem_size_.C); + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + masks_[s][2] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][2] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + 
Conv3dFpropActivationTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + // Conv3dFpropActivationTileAccessIteratorOptimized has constraint on filter positions + // due to the number of mask bits. + if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h index bad6598baf..a7f543681b 100644 --- a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,6 +45,7 @@ #include "cutlass/layout/matrix.h" #include "cutlass/conv/convolution.h" #include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -90,24 +91,7 @@ class Conv3dFpropFilterTileAccessIteratorAnalytic { // Parameters structure // - struct Params { - - Layout layout; - - // - // Methods - // - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params( - ConvProblemSize const &problem_size, - Layout const &layout - ): layout(layout) { - - } - }; + using Params = Conv3dAnalyticParams; private: diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..5d814890bd --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h @@ -0,0 +1,270 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
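The constructor of the iterator above replaces integer division and modulo with precomputed FastDivmod objects; its comment spells out the equivalent arithmetic. The sketch below is that plain-arithmetic decomposition of a linear (n, z, p, q) offset, with toy extents chosen only so the assertion can check the result.

#include <cassert>
#include <cstdio>

int main() {
  int Z = 4, P = 6, Q = 5;                                       // toy output extents
  int offset_nzpq = 3 * (Z * P * Q) + 2 * (P * Q) + 4 * Q + 1;   // encodes n=3, z=2, p=4, q=1

  int n = offset_nzpq / (Z * P * Q);
  int residual = offset_nzpq % (Z * P * Q);

  int z = residual / (P * Q);
  residual = residual % (P * Q);

  int p = residual / Q;
  int q = residual % Q;

  assert(n == 3 && z == 2 && p == 4 && q == 1);
  std::printf("n=%d z=%d p=%d q=%d\n", n, z, p, q);
  return 0;
}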
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv3dFpropFilterTileAccessIteratorOptimized{ +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv3dFpropFilterIteratorOptimizedParams { + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv3dFpropFilterIteratorOptimizedParams 
const &base): + Conv3dFpropFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): + Conv3dFpropFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { + + } + }; + +private: + + Conv3dFpropFilterIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_trs_; + int filter_c_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorOptimized( + Conv3dFpropFilterIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_trs_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + Index column = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < problem_size_.K) ? 
1u : 0); + predicates_ |= (pred << s); + } + + if (filter_c_ >= problem_size.C) { + predicates_ = 0u; + } + + pointer_ += ( + params_.layout({filter_c_, column}) + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_trs; + + // moves to the next tile + ++filter_trs_; + if (filter_trs_ == params_.TRS) { + + filter_trs_ = 0; + next = params_.inc_next_c; + filter_c_ += params_.filter_c_delta; + } + + if (filter_c_ >= problem_size_.C) { + predicates_ = 0; + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + return (predicates_ & (1u << iteration_strided_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_k; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv3d_params.h b/include/cutlass/conv/threadblock/conv3d_params.h new file mode 100644 index 0000000000..c95b52d90e --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_params.h @@ -0,0 +1,363 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
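The filter iterator above evaluates its bounds checks once at construction and packs one guard bit per strided load into predicates_, so valid() reduces to a single mask test. A minimal standalone version of that packing follows; the iteration count, filter extent, and offsets are toy numbers.

#include <cstdint>
#include <cstdio>

int main() {
  int const kStridedIterations = 4;   // toy value; must stay below 32 per the static_assert above
  int K = 48;                         // toy filter count
  int delta_strided = 16;             // toy spacing between strided loads
  int column = 8;                     // toy starting column for this thread

  // Construction-time packing: one bit per strided iteration.
  uint32_t predicates = 0;
  for (int s = 0; s < kStridedIterations; ++s) {
    uint32_t pred = (column + s * delta_strided < K) ? 1u : 0u;
    predicates |= (pred << s);
  }

  // Per-access test: a single AND decides whether the load is performed.
  for (int s = 0; s < kStridedIterations; ++s) {
    bool valid = (predicates & (1u << s)) != 0;
    std::printf("iteration %d: %s\n", s, valid ? "load" : "skip");
  }
  return 0;
}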
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Extracts the host-params objects into non-template code. +*/ + +#pragma once + +#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0 + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/conv2d_params.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Params structure used for all Conv3d analytic tile iterators +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dAnalyticParams { + + using Layout = Layout_; + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dAnalyticParams() { } + + CUTLASS_HOST_DEVICE + Conv3dAnalyticParams( + Conv3dProblemSize const &, // unused; placeholder to match other Params interfaces. 
+ Layout const &layout + ): layout(layout) { + + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dFpropActivationIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized +template<> +struct Conv3dFpropActivationIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + + int64_t inc_next[4]; // {next S, next R, next T, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int ZPQ; // product of Z*P*Q + int PQ; // product of P*Q + + FastDivmod zpq_divmod; + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + PQ(problem_size.P * problem_size.Q), + ZPQ(problem_size.Z * problem_size.P * problem_size.Q), + zpq_divmod(ZPQ), + pq_divmod(PQ), + q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv3d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 
-1 : 1); + + // next S + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next T + inc_next[2] = conv_sign * ( + int64_t(layout.stride()[2]) * problem_size.dilation_d + - (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[3] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dFpropFilterIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Conv3dFpropFilterIteratorOptimizedParams +{ + + using Layout = layout::TensorNDHWC; + + Layout layout; + int TRS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_trs; // offset in units of bytes to next TRS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv3dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dFpropFilterIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + TRACE_CONV_INITIALIZERS("conv3d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + TRS = problem_size.T * problem_size.R * problem_size.S; + + inc_next_k = (int64_t(layout.stride()[3]) * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_trs = + ( int64_t(layout.stride()[0]) + - int64_t(layout.stride()[3]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices + - int64_t(TRS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv3d DGRAD OutputGradient (dy) iterator +struct Conv3dDgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + + int64_t inc_next[4]; // {next S, next R, next T, next K} + int filter_k_delta; // number of logical 
elements to add to filter_k_ + + FastDivmod dhw_divmod; + FastDivmod hw_divmod; + FastDivmod w_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + dhw_divmod(problem_size.D * problem_size.H * problem_size.W), + hw_divmod(problem_size.H * problem_size.W), + w_divmod(problem_size.W) { + + TRACE_CONV_INITIALIZERS("conv3d_dgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1); + + // next S + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next T + inc_next[2] = conv_sign * ( + int64_t(layout.stride()[2]) * problem_size.dilation_d + - (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next K + inc_next[3] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv2d DGRAD Filter (w) iterator +struct Conv3dDgradFilterIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + int TRS; + int filter_k_delta; + + int64_t inc_next_strided; // offset in units of bytes to next K coordinate within tile + int64_t inc_next_trs; // offset in units of bytes to next TRS position + int64_t inc_next_k; // offset in units of bytes to next K position in subsequent tile + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv3dDgradFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), TRS(problem_size.T * problem_size.R * problem_size.S) { + + TRACE_CONV_INITIALIZERS("conv3d_dgrad", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + inc_next_strided = (layout.stride()[3] * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_trs = + ( layout.stride()[0] 
+ - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + ) * element_size_bits / 8; + + inc_next_k = + ( + threadblock_shape.row() * problem_size.split_k_slices * layout.stride()[3] + - (problem_size.T * problem_size.R * problem_size.S - 1) * layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + ) * element_size_bits / 8; + + filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h index 0ad49abd31..396d856a13 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h index 35c4643052..2835480d80 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h index 74017c09f6..b8af8efa44 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
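Each of the optimized Params objects above folds its pointer bookkeeping into precomputed byte increments (the inc_next_* fields), so the iterator's advance() only ever adds a constant. The self-contained illustration below shows the idea on a simplified one-dimensional layout; the stride, element width, and TRS value are toy numbers and the formulas are deliberately reduced, not the CUTLASS expressions.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  int64_t stride_trs = 64;   // toy stride between filter positions, in elements
  int element_bits = 16;     // e.g. half-precision elements
  int TRS = 27;              // toy T * R * S

  // Precomputed once, analogous to inc_next_trs and the wrap-around increment above.
  int64_t inc_next_trs  = stride_trs * element_bits / 8;
  int64_t inc_next_wrap = -(int64_t)(TRS - 1) * stride_trs * element_bits / 8;

  // The inner loop only adds precomputed constants.
  int64_t byte_offset = 0;
  for (int trs = 0; trs < TRS; ++trs) {
    byte_offset += (trs + 1 == TRS) ? inc_next_wrap : inc_next_trs;
  }

  assert(byte_offset == 0);   // after a full TRS pass the pointer is back at the start
  std::printf("byte_offset after one TRS pass = %lld\n", (long long)byte_offset);
  return 0;
}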
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -151,6 +151,11 @@ class Conv3dWgradOutputGradientTileAccessIteratorAnalytic { } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h index 2cab09d1f3..d3b356e07d 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -203,6 +203,11 @@ class Conv3dWgradOutputGradientTileAccessIteratorOptimized { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h index 1702847c10..aefdcd6db6 100644 --- a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h +++ b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h index 0d56ab6b3f..3d2062d536 100644 --- a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h +++ b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h index 181e3116e8..7c7aaf3a0b 100644 --- a/include/cutlass/coord.h +++ b/include/cutlass/coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index bd69a707d3..b25806a33c 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index 622f037b40..5a70398026 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,16 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// +#define CUTLASS_UNUSED(expr) do { (void)(expr); } while (0) + +#if defined(_MSC_VER) + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__) +#else + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + namespace cutlass { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -43,6 +53,7 @@ namespace cutlass { #define CUTLASS_DEVICE __forceinline__ __device__ #else #define CUTLASS_HOST_DEVICE inline +#define CUTLASS_DEVICE inline #endif /// Status code returned by CUTLASS operations diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h index f5166ab16a..733e7b271b 100644 --- a/include/cutlass/device_kernel.h +++ b/include/cutlass/device_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index d352ea5a64..49a63335b6 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
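The two macros added to cutlass/cutlass.h above can be exercised as in the sketch below. The macro definitions are copied from the hunk for self-containment; the function using them is a made-up example, not part of the patch.

#include <cassert>

#define CUTLASS_UNUSED(expr) do { (void)(expr); } while (0)

#if defined(_MSC_VER)
  #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__)
#else
  #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
#endif

// Hypothetical stub for a code path that is declared but not yet implemented.
void unimplemented_path(int argument) {
  CUTLASS_UNUSED(argument);      // marks the parameter as intentionally unused
  CUTLASS_NOT_IMPLEMENTED();     // fails an assert() in debug builds
}

int main() {
  // unimplemented_path(0);      // would trip the assertion in a debug build
  return 0;
}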
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,16 +45,33 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct Identity { + CUTLASS_HOST_DEVICE + T operator()(T value) const { + return value; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// ReLu operator - propagates NaNs template struct ReLu { CUTLASS_HOST_DEVICE - T operator()(T const & threshold, T const &value) const { + T operator()(T const & threshold, T value) const { if (value < threshold) { value = threshold; } return value; } + CUTLASS_HOST_DEVICE + T operator()(T value) const { + if (value < T()) { + value = T(); + } + return value; + } }; template @@ -107,6 +124,15 @@ struct Sigmoid > { } }; +// +// GELU function definitions implemented as described by +// Hendrycks, D., and Gimpel, K. in +// "Gaussian Error Linear Units (GELUs)." (2020) +// https://arxiv.org/pdf/1606.08415.pdf +// +// Floating-point constants are Taylor coefficients described in the paper. +// + // GELU operator template struct GELU { @@ -134,7 +160,7 @@ struct GELU > { GELU gelu_op; CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < int(rhs.size()); ++i) { + for (int i = 0; i < N; ++i) { y[i] = gelu_op(rhs[i]); } @@ -142,6 +168,72 @@ struct GELU > { } }; +// GELU operator implemented using the Taylor series approximation +template +struct GELU_taylor { + CUTLASS_HOST_DEVICE + T operator()(T const &z) const { + + T k0 = T(0.7978845608028654); + T k1 = T(0.044715); + + return T(cutlass::constants::half() * z * + (cutlass::constants::one() + fast_tanh(k0 * z * (cutlass::constants::one() + k1 * z * z)))); + } +}; + +template +struct GELU_taylor > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &rhs) const { + Array y; + GELU_taylor gelu_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + y[i] = gelu_op(rhs[i]); + } + + return y; + } +}; + +/// Computes backwards pass for GELU operator assuming d_t is the layer gradient and +/// z is computed from the forward pass. +template +struct dGELU { + CUTLASS_HOST_DEVICE + T operator()(T const &d_t, T const &z) const { + + T k0 = T(0.7978845608028654); + T k1 = T(0.044715); + T k2 = T(0.1070322243); + + T tanh_out = fast_tanh(k0 * z * (1 + k1 * z * z)); + + T ff = constants::half() * z * ((1 - tanh_out * tanh_out) * (k0 + k2 * z * z)) + + constants::half() * (1 + tanh_out); + + return ff * d_t; + } +}; + +template +struct dGELU > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &d_t, Array const &z) const { + Array y; + dGELU gelu_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + y[i] = gelu_op(d_t[i], z[i]); + } + + return y; + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread diff --git a/include/cutlass/epilogue/thread/conversion_op.h b/include/cutlass/epilogue/thread/conversion_op.h index ad17d41490..7cdf6cb0d0 100644 --- a/include/cutlass/epilogue/thread/conversion_op.h +++ b/include/cutlass/epilogue/thread/conversion_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
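The GELU_taylor functor above uses the tanh approximation from Hendrycks and Gimpel, with k0 = sqrt(2/pi) and k1 = 0.044715. The host-side scalar reference below mirrors that formula, substituting std::tanh for cutlass::fast_tanh; the sample inputs are arbitrary.

#include <cmath>
#include <cstdio>

// GELU(z) ~= 0.5 * z * (1 + tanh( sqrt(2/pi) * z * (1 + 0.044715 * z^2) ))
float gelu_taylor_ref(float z) {
  float const k0 = 0.7978845608028654f;   // sqrt(2 / pi)
  float const k1 = 0.044715f;
  return 0.5f * z * (1.0f + std::tanh(k0 * z * (1.0f + k1 * z * z)));
}

int main() {
  float samples[] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
  for (float z : samples) {
    std::printf("z = %+5.2f  gelu(z) = %+.6f\n", z, gelu_taylor_ref(z));
  }
  return 0;
}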
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/linear_combination.h b/include/cutlass/epilogue/thread/linear_combination.h index 4fff764fe5..fa2f72ac14 100644 --- a/include/cutlass/epilogue/thread/linear_combination.h +++ b/include/cutlass/epilogue/thread/linear_combination.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "cutlass/array.h" #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/scale_type.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -51,6 +52,7 @@ template < int Count, ///< Number of elements computed per operation typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > class LinearCombination { @@ -140,6 +142,10 @@ class LinearCombination { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != ElementCompute(0); } @@ -208,3 +214,5 @@ class LinearCombination { } // namespace thread } // namespace epilogue } // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_bias_relu.h b/include/cutlass/epilogue/thread/linear_combination_bias_relu.h new file mode 100644 index 0000000000..8c898f9074 --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_bias_relu.h @@ -0,0 +1,265 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
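The new ScaleType template parameter lets the epilogue decide whether the source tensor must be loaded: NoBetaScaling always reads C, OnlyAlphaScaling never does, and the default falls back to the beta != 0 test. The standalone version below mirrors that decision; the enum and function are simplified stand-ins, not the cutlass::epilogue::thread types.

#include <cstdio>

enum class ScaleKind { Default, NoBetaScaling, OnlyAlphaScaling };

bool is_source_needed(ScaleKind scale, float beta) {
  if (scale == ScaleKind::NoBetaScaling)    return true;    // C is always read
  if (scale == ScaleKind::OnlyAlphaScaling) return false;   // C is never read
  return beta != 0.0f;                                      // Default: read C only when beta != 0
}

int main() {
  std::printf("Default, beta=0        -> %d\n", is_source_needed(ScaleKind::Default, 0.0f));
  std::printf("NoBetaScaling, beta=0  -> %d\n", is_source_needed(ScaleKind::NoBetaScaling, 0.0f));
  std::printf("OnlyAlphaScaling       -> %d\n", is_source_needed(ScaleKind::OnlyAlphaScaling, 2.0f));
  return 0;
}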
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This is a partial specialization for fused Bias and ReLU. It supports the option of packing
+/// ReLU conditionals in a bit vector that may be used by backwards passes as an optimization.
+///
+/// This class can only be used with cutlass::epilogue::threadblock::EpilogueWithBroadcast<>.
+///
+/// This base class is meant to define the concept required of the
+/// EpilogueWithBroadcast::OutputOp
+template <
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename ElementCompute_,
+  typename ElementZ_,
+  int ElementsPerAccess,
+  bool StoreT = true
+>
+class LinearCombinationBiasRelu {
+public:
+
+  using ElementOutput = ElementC_;
+  using ElementC = ElementC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementZ = ElementZ_;
+
+  using ElementT = uint1b_t;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+
+  using ElementwiseOp = ReLu<ElementCompute>;
+  using BinaryOp = plus<ElementCompute>;
+
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentC = Array<ElementC, kElementsPerAccess>;
+  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
+  using FragmentT = Array<ElementT, kElementsPerAccess>;
+
+  /// If true, the 'Z' tensor is stored
+  static bool const kStoreZ = true;
+
+  /// If true, the 'T' tensor is stored
+  static bool const kStoreT = StoreT;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    ElementCompute threshold;              ///< ReLu threshold
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      alpha(ElementCompute(1)),
+      beta(ElementCompute()),
+      alpha_ptr(nullptr),
+      beta_ptr(nullptr),
+      threshold(ElementCompute()) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta,
+      ElementCompute threshold = ElementCompute()
+    ):
+      alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(threshold) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha
+    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(ElementCompute()) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr,
+      ElementCompute threshold = ElementCompute()
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold(threshold) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr), threshold(ElementCompute()) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementCompute threshold_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor from Params
+  CUTLASS_HOST_DEVICE
+  LinearCombinationBiasRelu(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    threshold_ = params.threshold;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is true
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C,
+    FragmentCompute const &V) const {
+
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    bool conditions[kElementsPerAccess];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(alpha_ * tmp_Accum[i] + beta_ * tmp_C[i], V[i]);
+
+      bool condition = !(z < threshold_);
+      z = fmax(z, threshold_);
+
+      result_Z[i] = z;
+      conditions[i] = condition;
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if (kStoreT) {
+      PackPredicates<kElementsPerAccess> pack_predicates;
+      frag_T = pack_predicates(conditions);
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is false
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentCompute const &V) const {
+
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    bool conditions[kElementsPerAccess];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
+
+      bool condition = !(z < threshold_);
+      z = fmax(z, threshold_);
+
+      result_Z[i] = z;
+      conditions[i] = condition;
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if (kStoreT) {
+      PackPredicates<kElementsPerAccess> pack_predicates;
+      frag_T = pack_predicates(conditions);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h
index 62a6ea7872..b2231bf767 100644
--- a/include/cutlass/epilogue/thread/linear_combination_clamp.h
+++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h
@@ -1,5 +1,5 @@
/*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -53,6 +53,7 @@ template < int Count, ///< Number of elements computed per operation typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > class LinearCombinationClamp { @@ -97,6 +98,13 @@ class LinearCombinationClamp { } + CUTLASS_HOST_DEVICE + Params( + ElementCompute alpha + ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) { + + } + CUTLASS_HOST_DEVICE Params( ElementCompute const *alpha_ptr, @@ -104,6 +112,13 @@ class LinearCombinationClamp { ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute const *alpha_ptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) { + + } }; private: @@ -128,6 +143,10 @@ class LinearCombinationClamp { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != ElementCompute(0); } @@ -227,9 +246,10 @@ class LinearCombinationClamp { template < typename ElementOutput_, ///< Data type used to load and store tensors int Count, ///< Number of elements computed per operation + ScaleType::Kind Scale, ///< Control Alpha and Beta scaling FloatRoundStyle Round > -class LinearCombinationClamp { +class LinearCombinationClamp { public: using ElementOutput = ElementOutput_; @@ -283,6 +303,13 @@ class LinearCombinationClamp { } + CUTLASS_HOST_DEVICE + Params( + ElementCompute alpha + ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) { + + } + CUTLASS_HOST_DEVICE Params( ElementCompute const *alpha_ptr, @@ -290,6 +317,13 @@ class LinearCombinationClamp { ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute const *alpha_ptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) { + + } }; private: @@ -314,6 +348,10 @@ class LinearCombinationClamp { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != ElementCompute(0); } @@ -413,6 +451,8 @@ template < typename ElementOutput_, /// Number of elements computed per operation int Count, + ///< Control Alpha and Beta scaling + ScaleType::Kind Scale = ScaleType::Default, /// Rounding mode FloatRoundStyle Round = FloatRoundStyle::round_to_nearest> class FastLinearCombinationClamp { @@ -467,9 +507,17 @@ class FastLinearCombinationClamp { Params(ElementCompute alpha, ElementCompute beta) : alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {} + CUTLASS_HOST_DEVICE + Params(ElementCompute alpha) + : alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {} + CUTLASS_HOST_DEVICE Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr) : alpha(0), beta(0), 
alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {} + + CUTLASS_HOST_DEVICE + Params(ElementCompute const *alpha_ptr) + : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {} }; private: @@ -491,7 +539,13 @@ class FastLinearCombinationClamp { /// Returns true if source is needed CUTLASS_HOST_DEVICE - bool is_source_needed() const { return beta_ != ElementCompute(0); } + bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + + return beta_ != ElementCompute(0); + } /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h index 30b6213478..c47e89f10f 100644 --- a/include/cutlass/epilogue/thread/linear_combination_gelu.h +++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h index 68f334bdb8..8ecaab65ff 100644 --- a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h +++ b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/linear_combination_relu.h b/include/cutlass/epilogue/thread/linear_combination_relu.h index 7a41404791..d545a78a0f 100644 --- a/include/cutlass/epilogue/thread/linear_combination_relu.h +++ b/include/cutlass/epilogue/thread/linear_combination_relu.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -35,6 +35,7 @@ #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" #include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/scale_type.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -53,6 +54,7 @@ template < int Count, ///< Number of elements computed per operation typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > class LinearCombinationRelu { @@ -93,7 +95,7 @@ class LinearCombinationRelu { CUTLASS_HOST_DEVICE Params( ElementCompute alpha, - ElementCompute beta, + ElementCompute beta = ElementCompute(0), ElementCompute threshold = ElementCompute(0) ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { @@ -102,7 +104,7 @@ class LinearCombinationRelu { CUTLASS_HOST_DEVICE Params( ElementCompute const *alpha_ptr, - ElementCompute const *beta_ptr, + ElementCompute const *beta_ptr = nullptr, ElementCompute threshold = ElementCompute(0) ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { @@ -133,6 +135,10 @@ class LinearCombinationRelu { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != ElementCompute(0); } @@ -170,7 +176,11 @@ class LinearCombinationRelu { multiply_add mul_add_accumulator; ReLu relu; - intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform + if (Scale == ScaleType::NoBetaScaling) + intermediate = converted_source; + else + intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform + intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X // Compute threshold optionally @@ -224,9 +234,10 @@ class LinearCombinationRelu { template < typename ElementOutput_, ///< Data type used to load and store tensors int Count, ///< Number of elements computed per operation + ScaleType::Kind Scale, ///< Control Alpha and Beta scaling FloatRoundStyle Round > -class LinearCombinationRelu { +class LinearCombinationRelu { public: using ElementOutput = ElementOutput_; @@ -264,7 +275,7 @@ class LinearCombinationRelu { CUTLASS_HOST_DEVICE Params( ElementCompute alpha, - ElementCompute beta, + ElementCompute beta = ElementCompute(0), ElementCompute threshold = ElementCompute(0) ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { @@ -273,7 +284,7 @@ class LinearCombinationRelu { CUTLASS_HOST_DEVICE Params( ElementCompute const *alpha_ptr, - ElementCompute const *beta_ptr, + ElementCompute const *beta_ptr = nullptr, ElementCompute threshold = ElementCompute(0) ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { @@ -304,6 +315,10 @@ class LinearCombinationRelu { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != 
ElementCompute(0);
   }
 
@@ -341,8 +356,12 @@ class LinearCombinationRelu {
     multiply_add<ComputeFragment> mul_add_accumulator;
     ReLu<ComputeFragment> relu;
 
-    intermediate = mul_add_source(beta_, converted_source);                             // X = beta * C + uniform
-    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    if (Scale == ScaleType::NoBetaScaling)
+      intermediate = converted_source;
+    else
+      intermediate = mul_add_source(beta_, converted_source);                           // X = beta * C + uniform
+
+    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
 
     // Compute threshold optionally
     intermediate = relu(threshold_, intermediate);
diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
index dbefd2258c..cea2d7a880 100644
--- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
+++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
diff --git a/include/cutlass/epilogue/thread/reduction_op.h b/include/cutlass/epilogue/thread/reduction_op.h
index 0331f0fad5..7078500fef 100644
--- a/include/cutlass/epilogue/thread/reduction_op.h
+++ b/include/cutlass/epilogue/thread/reduction_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
diff --git a/include/cutlass/epilogue/thread/scale_type.h b/include/cutlass/epilogue/thread/scale_type.h
new file mode 100644
index 0000000000..200db83a12
--- /dev/null
+++ b/include/cutlass/epilogue/thread/scale_type.h
@@ -0,0 +1,54 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Enum defines the behaviors of the epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specifies the scaling mode applied by the epilogue's output operator
+struct ScaleType {
+  enum Kind {
+    Default,            // alpha x C + beta x D
+    NoBetaScaling,      // alpha x C + D
+    OnlyAlphaScaling    // alpha x C
+  };
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
index 5c12f21680..84db8e131e 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,6 +39,11 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" +#include "cutlass/epilogue/thread/linear_combination_planar_complex.h" + #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" @@ -88,6 +93,7 @@ struct DefaultEpilogueComplexTensorOp { using OutputOp = OutputOp_; static int const kElementsPerAccess = ElementsPerAccess; using Operator = Operator_; + using ElementOutput = typename OutputOp::ElementOutput; using LayoutC = typename WarpMmaTensorOp::LayoutC; using ElementAccumulator = typename WarpMmaTensorOp::ElementC; @@ -173,6 +179,7 @@ struct DefaultEpilogueComplexTensorOp ; + + static int const kFragmentsPerIteration = 1; +}; + +/// Partial specialization for float <= float x 4 +template < + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp< + WarpShape, + InstructionShape, + float, + layout::RowMajor + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator< + ThreadMap, + float + >; + + static int const kFragmentsPerIteration = 2; }; /// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts. @@ -125,6 +156,8 @@ struct DefaultIteratorsTensorOp< 8, 8 >; + + static int const kFragmentsPerIteration = 2; }; /// Partial specialization for int8_t x 16 <= int32_t x 16 epilogues avoids shared memory bank conflicts. @@ -160,6 +193,8 @@ struct DefaultIteratorsTensorOp< 16, 8 >; + + static int const kFragmentsPerIteration = 1; }; /// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts. @@ -195,6 +230,8 @@ struct DefaultIteratorsTensorOp< 8, 8 >; + + static int const kFragmentsPerIteration = 1; }; /// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts. @@ -230,6 +267,8 @@ struct DefaultIteratorsTensorOp< 8, 8 >; + + static int const kFragmentsPerIteration = 1; }; } // namespace detail @@ -251,6 +290,7 @@ struct DefaultEpilogueTensorOp { static int const kPartitionsK = PartitionsK; using OutputOp = OutputOp_; static int const kElementsPerAccess = ElementsPerAccess; + using ElementOutput = typename OutputOp::ElementOutput; using LayoutC = typename WarpMmaTensorOp::LayoutC; using ElementAccumulator = typename WarpMmaTensorOp::ElementC; @@ -303,6 +343,8 @@ struct DefaultEpilogueTensorOp { /// Hard-coded padding elements added using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits::value * 4>; + static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1); + // // Define the epilogue // @@ -315,7 +357,8 @@ struct DefaultEpilogueTensorOp { WarpTileIterator, SharedLoadIterator, OutputOp, - Padding + Padding, + kFragmentsPerIteration >; }; @@ -325,7 +368,7 @@ struct DefaultEpilogueTensorOp { /// intereleaved output layout. For this case, shared memory is not needed. 
template + bool isSplitK = false> struct DefaultInterleavedEpilogueTensorOp { using Shape = Shape_; using WarpMmaTensorOp = WarpMmaTensorOp_; @@ -362,7 +405,7 @@ struct DefaultInterleavedEpilogueTensorOp { // using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue< Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator, - AccumulatorFragmentIterator, OutputOp, InterleavedK, IsBetaZero>; + AccumulatorFragmentIterator, OutputOp, InterleavedK>; }; //////////////////////////////////////////////////////////////////////////////// @@ -371,7 +414,7 @@ struct DefaultInterleavedEpilogueTensorOp { /// intereleaved output layout. For this case, shared memory is not needed. template + bool isSplitK = false> struct DefaultInterleavedConvEpilogue { using Shape = Shape_; using WarpMmaTensorOp = WarpMmaTensorOp_; @@ -408,7 +451,7 @@ struct DefaultInterleavedConvEpilogue { // using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue< Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator, - AccumulatorFragmentIterator, OutputOp, InterleavedK, IsBetaZero>; + AccumulatorFragmentIterator, OutputOp, InterleavedK>; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h index 7fec5110f4..4dbd339fd9 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -40,6 +40,11 @@ #include "cutlass/epilogue/thread/linear_combination.h" #include "cutlass/epilogue/thread/linear_combination_clamp.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" +#include "cutlass/epilogue/thread/linear_combination_planar_complex.h" + #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h index 58425c286c..353b0f5478 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -40,6 +40,11 @@ #include "cutlass/epilogue/thread/linear_combination.h" #include "cutlass/epilogue/thread/linear_combination_clamp.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" +#include "cutlass/epilogue/thread/linear_combination_planar_complex.h" + #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h index 69298d515a..0f33ad9a41 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h index 752b1ee9b4..901b16845f 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h index 9776ba0682..f9f77c2223 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h index cd828c697e..ccde4a526c 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h index f14be1ff8e..8f9dd454be 100644 --- a/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue.h b/include/cutlass/epilogue/threadblock/epilogue.h index a27541b47a..9afd3d5f7f 100644 --- a/include/cutlass/epilogue/threadblock/epilogue.h +++ b/include/cutlass/epilogue/threadblock/epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -63,7 +63,7 @@ namespace threadblock { //////////////////////////////////////////////////////////////////////////////// -/// Epilogue operator without splitk +/// Epilogue operator template < typename Shape_, ///< Shape of threadblock tile (concept: GemmShape) typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp) @@ -73,7 +73,8 @@ template < typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading from SMEM typename OutputOp_, ///< Output operator - typename Padding_ ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + int FragmentsPerPartition = 1 ///< Used to coarsten the epilogue granularity > class Epilogue : public EpilogueBase< @@ -82,7 +83,8 @@ class Epilogue : PartitionsK, AccumulatorFragmentIterator_, WarpTileIterator_, - Padding_> { + Padding_, + FragmentsPerPartition> { public: @@ -92,7 +94,8 @@ class Epilogue : PartitionsK, AccumulatorFragmentIterator_, WarpTileIterator_, - Padding_>; + Padding_, + FragmentsPerPartition>; using Shape = Shape_; using WarpMmaOperator = WarpMmaOperator_; @@ -113,7 +116,6 @@ class Epilogue : /// Accumulator element using ElementAccumulator = typename WarpTileIterator::Element; - /// Output element using ElementOutput = typename OutputTileIterator::Element; @@ -139,6 +141,9 @@ class Epilogue : /// Number of warps using WarpCount = typename Base::WarpCount; + int const kSmemTiles = Base::kFragmentsPerIteration > 1 ? 
Base::kFragmentsPerIteration : kPartitionsK; + int const kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles; + public: @@ -166,7 +171,10 @@ class Epilogue : int lane_idx ///< Id of thread within warp ): Base(shared_storage, thread_idx, warp_idx, lane_idx), - shared_load_iterator_(shared_storage.reference(), thread_idx) { } + shared_load_iterator_(shared_storage.reference(), thread_idx) + { + + } /// Streams the result to global memory CUTLASS_DEVICE @@ -177,7 +185,7 @@ class Epilogue : OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) if (!output_op.is_source_needed()) { - compute_source_not_needed_(output_op, destination_iterator, accumulators); + compute_source_not_needed_(output_op, destination_iterator, accumulators); } else { compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator); @@ -185,6 +193,8 @@ class Epilogue : } private: + + static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1."); /// Streams the result to global memory CUTLASS_DEVICE @@ -205,7 +215,7 @@ class Epilogue : // CUTLASS_PRAGMA_UNROLL - for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) { // // Convert and store fragment @@ -213,12 +223,24 @@ class Epilogue : __syncthreads(); - typename AccumulatorFragmentIterator::Fragment accum_fragment; - accum_fragment_iterator.load(accum_fragment); - ++accum_fragment_iterator; + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { + typename AccumulatorFragmentIterator::Fragment accum_fragment; - this->warp_tile_iterator_.store(accum_fragment); + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + this->warp_tile_iterator_.store(accum_fragment); + + if (p < Base::kFragmentsPerIteration - 1) { + this->warp_tile_iterator_.add_pointer_offset(kSmemPointerOffset); + } + } + + if (Base::kFragmentsPerIteration > 1) { + this->warp_tile_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration)); + } __syncthreads(); @@ -226,45 +248,53 @@ class Epilogue : // Load fragments from shared memory // - typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { - shared_load_iterator_.load(aligned_accum_fragment[0]); - // If the number of k-slices is > 1 - perform a reduction amongst the k-slices - if (kPartitionsK > 1) - { - plus add_fragments; - const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK; + typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; - CUTLASS_PRAGMA_UNROLL - for ( int i = 1; i < kPartitionsK; ++i) { - shared_load_iterator_.add_tile_offset({tile_row_offset , 0}); - shared_load_iterator_.load(aligned_accum_fragment[i]); - aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); + shared_load_iterator_.load(aligned_accum_fragment[0]); + + if (p < Base::kFragmentsPerIteration - 1) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); } + else if (kPartitionsK > 1) { - shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0}); - } + plus add_fragments; - // - // Compute the output result - // - - typename OutputTileIterator::Fragment output_fragment; + CUTLASS_PRAGMA_UNROLL + for ( int i = 1; i 
< kPartitionsK; ++i) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); + } - apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment[0]); + shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset); + } + // + // Compute the output result + // - // - // Store the final result - // + typename OutputTileIterator::Fragment output_fragment; - destination_iterator.store(output_fragment); - ++destination_iterator; - + apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment[0]); + + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + } + + if (Base::kFragmentsPerIteration > 1) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration)); + } } } - /// Streams the result to global memory CUTLASS_DEVICE @@ -323,19 +353,18 @@ class Epilogue : shared_load_iterator_.load(aligned_accum_fragment[0]); // If the number of k-slices is > 1 - perform a reduction amongst the k-slices - if (kPartitionsK > 1) - { + if (kPartitionsK > 1) { + plus add_fragments; - const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK; CUTLASS_PRAGMA_UNROLL for ( int i = 1; i < kPartitionsK; ++i) { - shared_load_iterator_.add_tile_offset({tile_row_offset , 0}); + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); shared_load_iterator_.load(aligned_accum_fragment[i]); aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); } - shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0}); + shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset); } // diff --git a/include/cutlass/epilogue/threadblock/epilogue_base.h b/include/cutlass/epilogue/threadblock/epilogue_base.h index a9b5a41404..76692d43cd 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_base.h +++ b/include/cutlass/epilogue/threadblock/epilogue_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -66,7 +66,8 @@ template < int PartitionsK, ///< Number of partitions of the K dimension typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM - typename Padding_ ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + int FragmentsPerIteration = 1 > class EpilogueBase { public: @@ -94,6 +95,9 @@ class EpilogueBase { kPartitionsK >; + /// Use this to control the granularity of one epilogue 'iteration' + static int const kFragmentsPerIteration = FragmentsPerIteration; + public: /// Shared storage allocation needed by the epilogue @@ -120,7 +124,7 @@ class EpilogueBase { /// Shape of the shared memory allocation for the epilogue using StorageShape = MatrixShape< - Shape::kRow + Padding::kRow, + (Shape::kRow + Padding::kRow) * kFragmentsPerIteration, Shape::kColumn + Padding::kColumn >; diff --git a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h index 6cb9963615..eae1ad4ff5 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h +++ b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue_workspace.h b/include/cutlass/epilogue/threadblock/epilogue_workspace.h index 36d196a37f..2341051c87 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_workspace.h +++ b/include/cutlass/epilogue/threadblock/epilogue_workspace.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h index b616545b9f..7bf7b4de8e 100644 --- a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h +++ b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,9 +73,7 @@ template < /// Output operator typename OutputOp_, /// Number of interleaved k - int InterleavedK, - /// Whether Beta is zero - bool IsBetaZero = false> + int InterleavedK> class InterleavedEpilogue { public: using Shape = Shape_; @@ -149,21 +147,75 @@ class InterleavedEpilogue { OutputTileIterator destination_iterator, ///< Tile iterator for destination AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + if (!output_op.is_source_needed()) { + compute_source_not_needed_(output_op, destination_iterator, accumulators); + } + else { + compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator); + } + } + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_not_needed_( + OutputOp const &output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const &accumulators ///< Complete warp-level accumulator tile + ) { // - // Predicated tile iterators constructed from members + // Iterator over warp-level accumulator fragment // - if (IsBetaZero && output_op.is_source_needed()) - assert(0); + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); - typename OutputTileIterator::Fragment source_fragment; + // + // Iterate over accumulator tile + // + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + + // + // Convert fragment + // + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + apply_output_operator_source_not_needed_(output_op, output_fragment, accum_fragment); + + // + // Store the final result + // - if (!IsBetaZero) { - if (!output_op.is_source_needed()) { - source_iterator.clear_mask(); - } + destination_iterator.set_iteration_index(iter); + destination_iterator.store(output_fragment); + ++destination_iterator; } + } + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_needed_( + OutputOp const &output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile + OutputTileIterator source_iterator ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + ) { + + // + // Predicated tile iterators constructed from members + // + + typename OutputTileIterator::Fragment source_fragment; source_fragment.clear(); @@ -183,11 +235,9 @@ class InterleavedEpilogue { // Load the source // - if (!IsBetaZero) { - source_iterator.set_iteration_index(iter); - source_iterator.load(source_fragment); - ++source_iterator; - } + source_iterator.set_iteration_index(iter); + source_iterator.load(source_fragment); + ++source_iterator; // // Convert fragment @@ -243,6 +293,30 @@ class InterleavedEpilogue { output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]); } } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_source_not_needed_( + OutputOp const &output_op, ///< Output operator + 
typename OutputTileIterator::Fragment &output_fragment, + typename AccumulatorFragmentIterator::Fragment const + &aligned_accum_fragment) { + OutputAccessType *output_frag_ptr = + reinterpret_cast(&output_fragment); + + AccumulatorAccessType const *compute_frag_ptr = + reinterpret_cast( + &aligned_accum_fragment); + + int const kOutputOpIterations = OutputTileIterator::Fragment::kElements / + OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + // Call the output operator + output_frag_ptr[i] = output_op(compute_frag_ptr[i]); + } + } }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h index cfe13cc167..377f33bd95 100644 --- a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h +++ b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -371,6 +371,11 @@ struct OutputTileOptimalThreadMap { using Shape = Shape_; + using TileShape = MatrixShape< + Shape::kTile * Shape::kCluster * Shape::kGroup * Shape::kRow, + Shape::kColumn + >; + using Iterations = OutputTileShape< Detail::RowArrangement::kIterationsColumn, Detail::RowArrangement::kIterationsRow, diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h index 1be50cbd90..a4a5d15a12 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -176,7 +176,7 @@ class PredicatedTileIterator { /// Internal state counter int state_[3]; - + private: // @@ -214,6 +214,11 @@ class PredicatedTileIterator { + ThreadMap::Delta::kColumn * c) < extent.column()); } + // Null pointer performs no accesses + if (!pointer) { + mask_.clear(); + } + // Initialize pointer byte_pointer_ = reinterpret_cast(pointer) + LongIndex(thread_offset.row()) * LongIndex(params_.stride) + @@ -288,11 +293,12 @@ class PredicatedTileIterator { } } + /// Loads a fragment from memory CUTLASS_DEVICE void load(Fragment &frag) { - load_with_byte_offset(frag, 0); + load_with_byte_offset(frag, 0); } /// Stores a fragment to memory @@ -326,11 +332,10 @@ class PredicatedTileIterator { bool guard = row_guard && mask_.predicates[column]; - if (guard) { - - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] = - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]; - } + cutlass::arch::global_store( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); } if (row + 1 < ThreadMap::Iterations::kRow) { @@ -349,11 +354,12 @@ class PredicatedTileIterator { } } + /// Stores a fragment to memory CUTLASS_DEVICE void store(Fragment const &frag) { - store_with_byte_offset(frag, 0); + store_with_byte_offset(frag, 0); } /// Advances to the next position to load or store @@ -404,7 +410,7 @@ class PredicatedTileIterator { ///< Sets the mask CUTLASS_DEVICE void get_mask(Mask &mask) { - return mask_; + mask = mask_; } ///< Sets the mask @@ -644,9 +650,8 @@ class InterleavedPredicatedTileIterator { bool guard = col_guard && mask_.predicates[iteration_contiguous_]; - if (guard) { - *memory_pointer = *frag_ptr; - } + cutlass::arch::global_store( + *frag_ptr, (void *)memory_pointer, guard); } /// Overrides the internal iteration index @@ -689,7 +694,7 @@ class InterleavedPredicatedTileIterator { ///< Sets the mask CUTLASS_DEVICE void get_mask(Mask &mask) { - return mask_; + mask = mask_; } ///< Sets the mask @@ -949,9 +954,8 @@ class InterleavedConvPredicatedTileIterator { AccessType const *frag_ptr = reinterpret_cast(&frag); AccessType *memory_pointer = reinterpret_cast(byte_pointer); - if (guard) { - *memory_pointer = *frag_ptr; - } + cutlass::arch::global_store( + *frag_ptr, (void *)memory_pointer, guard); } /// Overrides the internal iteration index @@ -993,7 +997,7 @@ class InterleavedConvPredicatedTileIterator { ///< Sets the mask CUTLASS_DEVICE void get_mask(Mask &mask) { - return mask_; + mask = mask_; } ///< Sets the mask diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h index a08e1e0616..d73ce1bdfa 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator.h b/include/cutlass/epilogue/threadblock/shared_load_iterator.h index 0aa3dbb19d..b5fefa26db 100644 --- a/include/cutlass/epilogue/threadblock/shared_load_iterator.h +++ b/include/cutlass/epilogue/threadblock/shared_load_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -61,7 +61,7 @@ template < class SharedLoadIterator { public: using ThreadMap = ThreadMap_; - using Shape = typename ThreadMap::Shape; + using Shape = typename ThreadMap::TileShape; using Element = Element_; @@ -151,7 +151,9 @@ class SharedLoadIterator { CUTLASS_DEVICE void add_tile_offset(TensorCoord const &offset) { - add_pointer_offset(offset.row() * stride_ / (sizeof_bits::value / 8) + offset.column() * Shape::kColumn); + byte_pointer_ += + offset.row() * Shape::kRow * stride_ + + offset.column() * Shape::kColumn * sizeof_bits::value / 8; } /// Loads a fragment from memory diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h index d37b07d562..5b31e33727 100644 --- a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h +++ b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -171,7 +171,7 @@ class SharedLoadIteratorMixed { void add_pointer_offset(LongIndex pointer_offset) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kLoadsPerAccess; ++i) { - pointers_ += pointer_offset / LoadType::kElements; + pointers_[i] += pointer_offset / LoadType::kElements; } } @@ -179,7 +179,9 @@ class SharedLoadIteratorMixed { void add_tile_offset(TensorCoord const &offset) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kLoadsPerAccess; ++i) { - pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + pointers_[i] += + offset.row() * Shape::kRow * stride_ + + offset.column() * Shape::kColumn / LoadType::kElements; } } @@ -236,7 +238,7 @@ class SharedLoadIteratorMixed { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Partial specialization for int32_t x 16 => int8_t x 16 +/// Partial specialization for int32_t x 8 => int8_t x 8 template < typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap) > @@ -339,7 +341,9 @@ class SharedLoadIteratorMixed { void add_tile_offset(TensorCoord const &offset) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kLoadsPerAccess; ++i) { - pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + pointers_[i] += + offset.row() * Shape::kRow * stride_ + + offset.column() * Shape::kColumn / LoadType::kElements; } } @@ -497,7 +501,9 @@ class SharedLoadIteratorMixed { void add_tile_offset(TensorCoord const &offset) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kLoadsPerAccess; ++i) { - pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + pointers_[i] += + offset.row() * Shape::kRow * stride_ + + offset.column() * Shape::kColumn / LoadType::kElements; } } diff --git a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h index 1bab9104c7..633d92193c 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h index 4c95649244..6117e167de 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_simt.h b/include/cutlass/epilogue/warp/fragment_iterator_simt.h index 6d75e5697b..b2ed96cf44 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_simt.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h index f620e4bddf..b028dedfde 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h index 1abbbdc03c..c826b2be55 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h index b2a0612ac5..fbceee5dba 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/simt_policy.h b/include/cutlass/epilogue/warp/simt_policy.h index 3e096978da..058a6c4413 100644 --- a/include/cutlass/epilogue/warp/simt_policy.h +++ b/include/cutlass/epilogue/warp/simt_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tensor_op_policy.h b/include/cutlass/epilogue/warp/tensor_op_policy.h index fd085c47b6..93eeda3e56 100644 --- a/include/cutlass/epilogue/warp/tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_simt.h b/include/cutlass/epilogue/warp/tile_iterator_simt.h index a9d03db1c3..552f15b3f2 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_simt.h +++ b/include/cutlass/epilogue/warp/tile_iterator_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h index 33cee0d375..7c22af81f0 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h index 82a93e2d00..cec0b8f27e 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h index 1754f58016..75c064e285 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h index c8eab0ceb1..1ea6dd4f43 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h index b0ecc5eb6f..e8e14f3e4d 100644 --- a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h index 7b938d3712..b1bc6cf3b1 100644 --- a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h index 4d9503e5f2..c54bdac52f 100644 --- a/include/cutlass/fast_math.h +++ b/include/cutlass/fast_math.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -30,9 +30,12 @@ #else #include #include +#include #endif #include "cutlass/cutlass.h" +#include "cutlass/uint128.h" +#include "cutlass/coord.h" /** * \file @@ -151,6 +154,7 @@ constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; } +///////////////////////////////////////////////////////////////////////////////////////////////// /** * log2 computation, what's the @@ -221,6 +225,8 @@ void fast_divmod(int& quo, int64_t& rem, int64_t src, int div, unsigned int mul, rem = src - (quo * div); } +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Object to encapsulate the fast division+modulus operation. 
/// /// This object precomputes two values used to accelerate the computation and is best used @@ -272,9 +278,159 @@ struct FastDivmod { } }; -/****************************************************************************** - * Min/Max - ******************************************************************************/ +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Object to encapsulate the fast division+modulus operation for 64b integer division. +/// +/// This object precomputes two values used to accelerate the computation and is best used +/// when the divisor is a grid-invariant. In this case, it may be computed in host code and +/// marshalled along other kernel arguments using the 'Params' pattern. +/// +/// Example: +/// +/// +/// uint64_t quotient, remainder, dividend, divisor; +/// +/// FastDivmodU64 divmod(divisor); +/// +/// divmod(quotient, remainder, dividend); +/// +/// // quotient = (dividend / divisor) +/// // remainder = (dividend % divisor) +/// +struct FastDivmodU64 { + + uint64_t divisor; + uint64_t multiplier; + unsigned int shift_right; + unsigned int round_up; + + // + // Static methods + // + + /// Computes b, where 2^b is the greatest power of two that is less than or equal to x + CUTLASS_HOST_DEVICE + static uint32_t integer_log2(uint64_t x) { + uint32_t n = 0; + while (x >>= 1) { + ++n; + } + return n; + } + + /// Default ctor + CUTLASS_HOST_DEVICE + FastDivmodU64(): divisor(0), multiplier(0), shift_right(0), round_up(0) { } + + /// Construct the FastDivmod object, in host code ideally. + /// + /// This precomputes some values based on the divisor and is computationally expensive. + CUTLASS_HOST_DEVICE + FastDivmodU64(uint64_t divisor_): divisor(divisor_), multiplier(1), shift_right(0), round_up(0) { + + if (divisor) { + shift_right = integer_log2(divisor); + + if ((divisor & (divisor - 1)) == 0) { + multiplier = 0; + } + else { + uint64_t power_of_two = (uint64_t(1) << shift_right); + uint64_t multiplier_lo = uint128_t(0, power_of_two) / divisor; + multiplier = uint128_t(power_of_two, power_of_two) / divisor; + round_up = (multiplier_lo == multiplier ? 1 : 0); + } + } + } + + /// Returns the quotient of floor(dividend / divisor) + CUTLASS_HOST_DEVICE + uint64_t divide(uint64_t dividend) const { + uint64_t quotient = 0; + + #ifdef __CUDA_ARCH__ + uint64_t x = dividend; + if (multiplier) { + x = __umul64hi(dividend + round_up, multiplier); + } + quotient = (x >> shift_right); + #else + // TODO - use proper 'fast' division here also. No reason why x86-code shouldn't be optimized. + quotient = dividend / divisor; + #endif + + return quotient; + } + + /// Computes the remainder given a computed quotient and dividend + CUTLASS_HOST_DEVICE + uint64_t modulus(uint64_t quotient, uint64_t dividend) const { + return uint32_t(dividend - quotient * divisor); + } + + /// Returns the quotient of floor(dividend / divisor) and computes the remainder + CUTLASS_HOST_DEVICE + uint64_t divmod(uint64_t &remainder, uint64_t dividend) const { + uint64_t quotient = divide(dividend); + remainder = modulus(quotient, dividend); + return quotient; + } + + /// Computes integer division and modulus using precomputed values. This is computationally + /// inexpensive. 
+  CUTLASS_HOST_DEVICE
+  void operator()(uint64_t &quotient, uint64_t &remainder, uint64_t dividend) const {
+    quotient = divmod(remainder, dividend);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes the coordinate decomposition from a linear index.
+///
+/// This decomposition is accelerated by the FastDivmodU64 object. It is assumed that
+/// a coordinate of indices can be decomposed by div/mod operations.
+/// Note, it is assumed that element divmod[0] divides by extent[1].
+///
+/// For example, assume 4-D coordinate (n, p, q, c) is mapped to a linear index `npqc`. This
+/// can be decomposed via three divide and modulus operations:
+///
+///   c = npqc % C;         |  divmod[2] = FastDivmodU64(C)
+/// npq = npqc / C;         |   coord[3] = c
+///
+///   q =  npq % Q;         |  divmod[1] = FastDivmodU64(Q)
+///  np =  npq / Q;         |   coord[2] = q
+///
+///   p =   np % P;         |  divmod[0] = FastDivmodU64(P)
+///   n =   np / P;         |   coord[1] = p
+///
+///                         |   coord[0] = n
+///
+template <int Rank>
+CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecomposition(
+  uint64_t linear_idx,                    ///< Linear index to decompose
+  FastDivmodU64 const *divmod) {          ///< Pointer to array of Rank-1 FastDivmodU64 objects
+
+  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
+
+  Coord<Rank> coord;
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = Rank; i > 1; --i) {
+    uint64_t remainder;
+    linear_idx = divmod[i - 2].divmod(remainder, linear_idx);
+    coord[i - 1] = int(remainder);
+  }
+
+  coord[0] = int(linear_idx);
+
+  return coord;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Min/Max
+/////////////////////////////////////////////////////////////////////////////////////////////////
 
 template
 struct Min {
@@ -296,6 +452,30 @@ constexpr int const_max(int a, int b) {
   return (b > a ? b : a);
 }
 
+template <typename T>
+CUTLASS_HOST_DEVICE
+T fast_min(T a, T b) {
+  return (b < a ? b : a);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+float fast_min(float a, float b) {
+  return fminf(a, b);
+}
+
+template <typename T>
+CUTLASS_HOST_DEVICE
+T fast_max(T a, T b) {
+  return (a < b ? b : a);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+float fast_max(float a, float b) {
+  return fmaxf(a, b);
+}
+
 CUTLASS_HOST_DEVICE
 float fast_cos(float theta) {
 #if defined(__CUDA_ARCH__)
@@ -404,6 +584,24 @@ double fast_log(double x) {
 #endif
 }
 
+CUTLASS_HOST_DEVICE
+float fast_tanh(float x) {
+  #if defined(__CUDA_ARCH__)
+  return ::tanhf(x);
+  #else
+  return std::tanh(x);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_tanh(double x) {
+  #if defined(__CUDA_ARCH__)
+  return ::tanh(x);
+  #else
+  return std::tanh(x);
+  #endif
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 } // namespace cutlass
diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index d20c45df2e..52d4ca59e6 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
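
The `FastDivmodU64` and `CoordinateDecomposition` helpers added to `fast_math.h` above can be exercised from host code as well as device code. A minimal host-side sketch, assuming the CUTLASS include paths are available and using illustrative tensor extents:

```
#include <cstdint>
#include <iostream>

#include "cutlass/fast_math.h"

int main() {
  // Extents of an (n, p, q, c) tensor; the values are illustrative.
  uint64_t const P = 56, Q = 56, C = 64;

  // Per the comment above, divmod[0] divides by P, divmod[1] by Q, divmod[2] by C.
  cutlass::FastDivmodU64 divmod[] = {
    cutlass::FastDivmodU64(P),
    cutlass::FastDivmodU64(Q),
    cutlass::FastDivmodU64(C)
  };

  uint64_t linear_idx = 123456;

  // Recovers (n, p, q, c) such that ((n * P + p) * Q + q) * C + c == linear_idx.
  cutlass::Coord<4> coord = cutlass::CoordinateDecomposition<4>(linear_idx, divmod);

  std::cout << coord[0] << ", " << coord[1] << ", "
            << coord[2] << ", " << coord[3] << std::endl;

  // FastDivmodU64 may also be used directly for a single division.
  uint64_t quotient, remainder;
  cutlass::FastDivmodU64 divmod_c(C);
  divmod_c(quotient, remainder, linear_idx);   // quotient = idx / C, remainder = idx % C

  return 0;
}
```
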
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -235,6 +235,156 @@ struct conjugate { ///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct logical_and { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return ((a && b) ? T(1) : T()); + } +}; + +template +struct logical_or { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return ((a || b) ? T(1) : T()); + } +}; + +template +struct logical_not { + CUTLASS_HOST_DEVICE + T operator()(T const &a) const { + return T(!(a)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct bit_and { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return a & b; + } +}; + +template +struct bit_or { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return a | b; + } +}; + +template +struct bit_not { + CUTLASS_HOST_DEVICE + T operator()(T const &a) const { + return ~a; + } +}; + +template +struct bit_xor { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return a ^ b; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Partial specializations for Arrays +template +struct bit_and> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] & b_data[i]); + } + + return result; + } +}; + +// Partial specializations for Arrays +template +struct bit_or> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] | b_data[i]); + } + + return result; + } +}; + + +// Partial specializations for Arrays +template +struct bit_not> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (~a_data[i]); + } + + return result; + } +}; + +// Partial specializations for Arrays +template +struct bit_xor> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] ^ b_data[i]); + } + + return result; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + template struct conjugate> 
{ CUTLASS_HOST_DEVICE diff --git a/include/cutlass/gemm/device/default_gemm_configuration.h b/include/cutlass/gemm/device/default_gemm_configuration.h index c65b3f0062..ad38bf63e3 100644 --- a/include/cutlass/gemm/device/default_gemm_configuration.h +++ b/include/cutlass/gemm/device/default_gemm_configuration.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h index 70383e15ef..e1d0092cdb 100644 --- a/include/cutlass/gemm/device/gemm.h +++ b/include/cutlass/gemm/device/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,7 +133,9 @@ namespace device { /// Operator class tag typename OperatorClass, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) @@ -211,9 +213,7 @@ template < /// Operation performed by GEMM typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::Operator, - /// Whether Beta is zero or not - bool IsBetaZero = false> + ElementAccumulator_>::Operator> class Gemm { public: @@ -241,7 +241,6 @@ class Gemm { static int const kAlignmentB = AlignmentB; static int const kAlignmentC = EpilogueOutputOp::kCount; static bool const kSplitKSerial = SplitKSerial; - static bool const kIsBetaZero = IsBetaZero; static ComplexTransform const kTransformA = ComplexTransform::kNone; static ComplexTransform const kTransformB = ComplexTransform::kNone; @@ -265,8 +264,7 @@ class Gemm { ThreadblockSwizzle, kStages, kSplitKSerial, - Operator, - kIsBetaZero + Operator >::GemmKernel; /// Argument structure @@ -533,15 +531,13 @@ template < /// If true, kernel supports split-K as a serial reduction bool SplitKSerial, /// Operation performed by GEMM - typename Operator_, - /// Beta is zero or not - bool IsBetaZero> + typename Operator_> class Gemm { + Operator_> { public: using ElementA = ElementA_; @@ -569,7 +565,6 @@ class Gemm; using UnderlyingArguments = typename UnderlyingOperator::Arguments; diff --git a/include/cutlass/gemm/device/gemm_array.h b/include/cutlass/gemm/device/gemm_array.h index c44579e005..12bc300ff2 100644 --- a/include/cutlass/gemm/device/gemm_array.h +++ b/include/cutlass/gemm/device/gemm_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
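
The device-level `Gemm` changes in this chunk drop the trailing `IsBetaZero` template parameter, so instantiations simply omit it. A minimal SGEMM-style sketch relying on the documented defaults (SIMT operator class, `arch::Sm70` tuning tag):

```
#include "cutlass/gemm/device/gemm.h"

// Column-major single-precision GEMM using the default SIMT configuration.
// Note there is no longer a trailing IsBetaZero argument to spell out.
using Sgemm = cutlass::gemm::device::Gemm<
    float, cutlass::layout::ColumnMajor,   // A
    float, cutlass::layout::ColumnMajor,   // B
    float, cutlass::layout::ColumnMajor>;  // C and D
```
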
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,7 +133,9 @@ namespace device { /// Operator class tag typename OperatorClass, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) @@ -257,8 +259,7 @@ class GemmArray { ThreadblockSwizzle, kStages, false, - Operator, - false + Operator >::GemmKernel; using GemmKernel = kernel::GemmArray; diff --git a/include/cutlass/gemm/device/gemm_batched.h b/include/cutlass/gemm/device/gemm_batched.h index 052bd90093..8f09b4a77c 100644 --- a/include/cutlass/gemm/device/gemm_batched.h +++ b/include/cutlass/gemm/device/gemm_batched.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,7 +133,9 @@ namespace device { /// Operator class tag typename OperatorClass, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) @@ -257,8 +259,7 @@ class GemmBatched { ThreadblockSwizzle, kStages, false, - Operator, - false + Operator >::GemmKernel; using GemmKernel = kernel::GemmBatched; diff --git a/include/cutlass/gemm/device/gemm_complex.h b/include/cutlass/gemm/device/gemm_complex.h index 8ad1036bb1..70e0b46a38 100644 --- a/include/cutlass/gemm/device/gemm_complex.h +++ b/include/cutlass/gemm/device/gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -47,37 +47,40 @@ namespace device { ///////////////////////////////////////////////////////////////////////////////////////////////// -/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may - be invoked from host code. +/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM + kernels that may be invoked from host code. The contributions of this class are: - - 1. At compile time, it maps data types and high-level structural parameters onto - specific CUTLASS components. - 2. At runtime, it maps logical arguments to GEMM problems to kernel parameters. + 1. At compile time, it maps data types and high-level structural parameters + onto specific CUTLASS components. + + 2. At runtime, it maps logical arguments to GEMM problems to kernel + parameters. 3. At runtime, it launches kernels on the device. - The intent is to provide a convenient mechanism for interacting with most plausible GEMM - configurations for each supported architecture. 
Consequently, not all parameters are exposed - to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy - are selected to tradeoff simplicity of the interface with flexibility. We expect - most configurations to be specified at this level. Applications with more exotic requirements - may construct their kernels of interest using CUTLASS components at the threadblock, warp, - and thread levels of abstraction. + The intent is to provide a convenient mechanism for interacting with most + plausible GEMM configurations for each supported architecture. Consequently, + not all parameters are exposed to the top-level interface. Rather, sensible + defaults at each level of the CUTLASS hierarchy are selected to tradeoff + simplicity of the interface with flexibility. We expect most configurations to + be specified at this level. Applications with more exotic requirements may + construct their kernels of interest using CUTLASS components at the + threadblock, warp, and thread levels of abstraction. - CUTLASS exposes computations using the functor design pattern in which objects compose some - internal state with an overloaded function call operator. This enables decoupling of - initialization from execution, possibly reducing overhead during steady state phases of - application execution. + CUTLASS exposes computations using the functor design pattern in which objects + compose some internal state with an overloaded function call operator. This + enables decoupling of initialization from execution, possibly reducing + overhead during steady state phases of application execution. - CUTLASS device-level operators expose an Arguments structure encompassing each logical - input to the computation. This is distinct from the kernel-level Params structure pattern - which contains application-specific precomputed state needed by the device code. + CUTLASS device-level operators expose an Arguments structure encompassing each + logical input to the computation. This is distinct from the kernel-level + Params structure pattern which contains application-specific precomputed state + needed by the device code. - Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN - is as follows: + Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's + SGEMM NN is as follows: // // Instantiate the CUTLASS GEMM operator. @@ -111,46 +114,48 @@ namespace device { template < /// Element type for A matrix operand typename ElementA, - + /// Layout type for A matrix operand typename LayoutA, - + /// Element type for B matrix operand typename ElementB, - + /// Layout type for B matrix operand typename LayoutB, - + /// Element type for C and D matrix operands typename ElementC, - + /// Layout type for C and D matrix operands typename LayoutC, - + /// Element type for internal accumulation typename ElementAccumulator, /// Operator class tag typename OperatorClass, - - /// Tag indicating architecture to tune for + + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. 
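
The gemm_complex.h comment above describes the Arguments structure and functor-call pattern shared by the device-level operators. A short sketch of that pattern, reusing the `Sgemm` alias from the earlier sketch; `M`, `N`, `K`, the device pointers, and the leading dimensions are assumed to be defined and valid:

```
// Logical GEMM arguments for D = alpha * A * B + beta * C.
Sgemm::Arguments args(
    {M, N, K},          // problem size
    {ptr_A, lda},       // TensorRef to A
    {ptr_B, ldb},       // TensorRef to B
    {ptr_C, ldc},       // TensorRef to C (source)
    {ptr_D, ldd},       // TensorRef to D (destination)
    {alpha, beta});     // epilogue parameters

Sgemm gemm_op;

// The functor call maps the logical arguments to kernel-level Params and launches.
cutlass::Status status = gemm_op(args);
```
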
typename ArchTag, - + /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape, - + /// Warp-level tile size (concept: GemmShape) typename WarpShape, - + /// Warp-level tile size (concept: GemmShape) typename InstructionShape, - + /// Epilogue output operator typename EpilogueOutputOp, - + /// Threadblock-level swizzling operator typename ThreadblockSwizzle, - + /// Number of stages used in the pipelined mainloop int Stages > @@ -173,7 +178,7 @@ template < typename ElementAccumulator_ = ElementC_, /// Operator class tag typename OperatorClass_ = arch::OpClassSimt, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. typename ArchTag_ = arch::Sm70, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultGemmConfiguration< @@ -192,7 +197,8 @@ template < OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, /// Threadblock-level swizzling operator - typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, + typename ThreadblockSwizzle_ = + threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration + bool SplitKSerial = false> class GemmComplex { public: diff --git a/include/cutlass/gemm/device/gemm_sparse.h b/include/cutlass/gemm/device/gemm_sparse.h index bfd5606e1f..04e2dd6673 100644 --- a/include/cutlass/gemm/device/gemm_sparse.h +++ b/include/cutlass/gemm/device/gemm_sparse.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,7 +133,9 @@ namespace device { /// Operator class tag typename OperatorClass, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. 
typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) @@ -211,9 +213,7 @@ template < /// Operation performed by GEMM typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::Operator, - /// Whether Beta is zero or not - bool IsBetaZero = false> + ElementAccumulator_>::Operator> class SparseGemm { public: @@ -241,7 +241,6 @@ class SparseGemm { static int const kAlignmentB = AlignmentB; static int const kAlignmentC = EpilogueOutputOp::kCount; static bool const kSplitKSerial = SplitKSerial; - static bool const kIsBetaZero = IsBetaZero; static ComplexTransform const kTransformA = ComplexTransform::kNone; static ComplexTransform const kTransformB = ComplexTransform::kNone; @@ -265,8 +264,7 @@ class SparseGemm { ThreadblockSwizzle, kStages, kSplitKSerial, - Operator, - kIsBetaZero + Operator >::GemmKernel; using ElementE = typename GemmKernel::ElementE; diff --git a/include/cutlass/gemm/device/gemm_splitk_parallel.h b/include/cutlass/gemm/device/gemm_splitk_parallel.h index 73f1c240b0..987319c2cc 100644 --- a/include/cutlass/gemm/device/gemm_splitk_parallel.h +++ b/include/cutlass/gemm/device/gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,9 @@ template < typename ElementAccumulator_ = ElementC_, /// Operator class tag typename OperatorClass_ = arch::OpClassSimt, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag_ = arch::Sm70, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultGemmConfiguration< @@ -425,7 +427,9 @@ template < typename ElementAccumulator_, /// Operator class tag typename OperatorClass_, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag_, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_, diff --git a/include/cutlass/gemm/device/gemm_universal.h b/include/cutlass/gemm/device/gemm_universal.h index 0912909014..54f8e14932 100644 --- a/include/cutlass/gemm/device/gemm_universal.h +++ b/include/cutlass/gemm/device/gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,9 @@ template < typename ElementAccumulator_ = ElementC_, /// Operator class tag typename OperatorClass_ = arch::OpClassSimt, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. 
The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag_ = arch::Sm70, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultGemmConfiguration< @@ -202,7 +204,9 @@ template < typename ElementAccumulator_, /// Operator class tag typename OperatorClass_, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag_, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_, diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index a669483541..fb54170134 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h index 9ffc6b041c..74c519a44e 100644 --- a/include/cutlass/gemm/device/gemm_universal_base.h +++ b/include/cutlass/gemm/device/gemm_universal_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/gemm.h b/include/cutlass/gemm/gemm.h index 51f535f7c1..62725ffe08 100644 --- a/include/cutlass/gemm/gemm.h +++ b/include/cutlass/gemm/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm.h b/include/cutlass/gemm/kernel/default_gemm.h index 0aba2d3a72..966b00890e 100644 --- a/include/cutlass/gemm/kernel/default_gemm.h +++ b/include/cutlass/gemm/kernel/default_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
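
The recurring comment above clarifies that `ArchTag` names the minimum SM required by the selected features, not the only build target; a kernel tuned for an older tag may be compiled for any newer architecture. A hedged sketch of a Turing-era Tensor Core configuration using the documented defaults:

```
#include "cutlass/gemm/device/gemm.h"

// Tensor Core kernel whose instructions first appear on SM75 (Turing). The same
// type may also be compiled for SM80 or newer targets.
using GemmTensorOpSm75 = cutlass::gemm::device::Gemm<
    cutlass::half_t, cutlass::layout::ColumnMajor,   // A
    cutlass::half_t, cutlass::layout::ColumnMajor,   // B
    cutlass::half_t, cutlass::layout::ColumnMajor,   // C and D
    float,                                           // accumulation
    cutlass::arch::OpClassTensorOp,                  // use Tensor Cores
    cutlass::arch::Sm75>;                            // minimum SM for this configuration
```
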
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -111,9 +111,7 @@ template < /// epilogue bool SplitKSerial, /// Operation performed by GEMM - typename Operator, - /// Beta is zero or not - bool IsBetaZero = false> + typename Operator> struct DefaultGemm; //////////////////////////////////////////////////////////////////////////////// @@ -295,16 +293,14 @@ template < /// epilogue bool SplitKSerial, /// Operation performed by GEMM - typename Operator, - /// Is Beta zero or not - bool IsBetaZero> + typename Operator> struct DefaultGemm< ElementA, layout::ColumnMajorInterleaved, kAlignmentA, ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, layout::ColumnMajorInterleaved, int32_t, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, - SplitKSerial, Operator, IsBetaZero> { + SplitKSerial, Operator> { using LayoutA = layout::ColumnMajorInterleaved; using LayoutB = layout::RowMajorInterleaved; using LayoutC = layout::ColumnMajorInterleaved; @@ -324,8 +320,7 @@ struct DefaultGemm< using Epilogue = typename cutlass::epilogue::threadblock:: DefaultInterleavedEpilogueTensorOp< ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, - 64 / sizeof_bits::value, InterleavedK, - IsBetaZero>::Epilogue; + 64 / sizeof_bits::value, InterleavedK>::Epilogue; /// Define the kernel-level GEMM operator. using GemmKernel = kernel::Gemm; @@ -361,16 +356,14 @@ template < /// epilogue bool SplitKSerial, /// Operation performed by GEMM - typename Operator, - /// Is Beta zero or not - bool IsBetaZero> + typename Operator> struct DefaultGemm, kAlignmentA, ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, layout::ColumnMajorInterleaved, int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> { + ThreadblockSwizzle, 2, SplitKSerial, Operator> { using LayoutA = layout::ColumnMajorInterleaved; using LayoutB = layout::RowMajorInterleaved; using LayoutC = layout::ColumnMajorInterleaved; @@ -389,8 +382,7 @@ struct DefaultGemm, using Epilogue = typename cutlass::epilogue::threadblock:: DefaultInterleavedEpilogueTensorOp< ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, - 64 / sizeof_bits::value, InterleavedK, - IsBetaZero>::Epilogue; + 64 / sizeof_bits::value, InterleavedK>::Epilogue; /// Define the kernel-level GEMM operator. using GemmKernel = kernel::Gemm; @@ -682,7 +674,7 @@ struct DefaultGemm, EpilogueOutputOp, ThreadblockSwizzle, 2, SplitKSerial, - Operator, false> { + Operator> { using InstructionShape = GemmShape<1, 1, 4>; using ElementA = int8_t; using ElementB = int8_t; @@ -703,8 +695,7 @@ struct DefaultGemm::ThreadblockMma; static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount; diff --git a/include/cutlass/gemm/kernel/default_gemm_complex.h b/include/cutlass/gemm/kernel/default_gemm_complex.h index cff06e69de..350b3484c1 100644 --- a/include/cutlass/gemm/kernel/default_gemm_complex.h +++ b/include/cutlass/gemm/kernel/default_gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h index 870084834a..a60cf02452 100644 --- a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h +++ b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm_sparse.h b/include/cutlass/gemm/kernel/default_gemm_sparse.h index 9c43666fe0..e212f6bfca 100644 --- a/include/cutlass/gemm/kernel/default_gemm_sparse.h +++ b/include/cutlass/gemm/kernel/default_gemm_sparse.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -112,9 +112,7 @@ template < /// epilogue bool SplitKSerial, /// Operation performed by GEMM - typename Operator, - /// Beta is zero or not - bool IsBetaZero = false> + typename Operator> struct DefaultSparseGemm; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h index e23965d336..d97a93f498 100644 --- a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h +++ b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm_universal.h b/include/cutlass/gemm/kernel/default_gemm_universal.h index 579005cb41..f9094672d9 100644 --- a/include/cutlass/gemm/kernel/default_gemm_universal.h +++ b/include/cutlass/gemm/kernel/default_gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -185,8 +185,7 @@ struct DefaultGemmUniversal< ThreadblockSwizzle, Stages, true, - Operator, - false + Operator >::GemmKernel; /// Define the kernel in terms of the default kernel diff --git a/include/cutlass/gemm/kernel/default_gemm_with_reduction.h b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h new file mode 100644 index 0000000000..47c075c920 --- /dev/null +++ b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h @@ -0,0 +1,240 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Reduction based on an existing UniversalGemm kernel. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/kernel/gemm_with_fused_epilogue.h" +#include "cutlass/gemm/kernel/default_gemm_universal.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h" +#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Epilogue reduction operator + typename EpilogueReductionOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// + typename Enable = void +> +struct DefaultGemmWithReduction { + + using GemmBase = typename DefaultGemmUniversal< + ElementA_, LayoutA_, TransformA, kAlignmentA, + ElementB_, LayoutB_, TransformB, kAlignmentB, + ElementC_, LayoutC_, ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + Operator + >::GemmKernel; + + // Replace epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp< + typename GemmBase::Epilogue::Shape, + typename GemmBase::Epilogue::WarpMmaOperator, + GemmBase::Epilogue::kPartitionsK, + ElementC_, + EpilogueOutputOp, + EpilogueReductionOp, + GemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Compose the GEMM kernel + using GemmKernel = GemmWithFusedEpilogue< + typename GemmBase::Mma, + Epilogue, + ThreadblockSwizzle + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parital specialization: ArchTag = cutlass::arch::Sm70 +/// +/// +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename 
LayoutB_, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Epilogue reduction operator + typename EpilogueReductionOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// + typename Enable +> +struct DefaultGemmWithReduction< + ElementA_, LayoutA_, TransformA, kAlignmentA, + ElementB_, LayoutB_, TransformB, kAlignmentB, + ElementC_, LayoutC_, + ElementAccumulator, + OperatorClass, + cutlass::arch::Sm70, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + EpilogueReductionOp, + ThreadblockSwizzle, + Stages, + Operator, + Enable + > { + + using GemmBase = typename DefaultGemmUniversal< + ElementA_, LayoutA_, TransformA, kAlignmentA, + ElementB_, LayoutB_, TransformB, kAlignmentB, + ElementC_, LayoutC_, ElementAccumulator, + OperatorClass, + cutlass::arch::Sm70, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + Operator + >::GemmKernel; + + // Replace epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionVoltaTensorOp< + typename GemmBase::Epilogue::Shape, + typename GemmBase::Epilogue::WarpMmaOperator, + GemmBase::Epilogue::kPartitionsK, + ElementC_, + EpilogueOutputOp, + EpilogueReductionOp, + GemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Compose the GEMM kernel + using GemmKernel = GemmWithFusedEpilogue< + typename GemmBase::Mma, + Epilogue, + ThreadblockSwizzle + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/default_gemv.h b/include/cutlass/gemm/kernel/default_gemv.h index 36ae339c4e..03d9c43c52 100755 --- a/include/cutlass/gemm/kernel/default_gemv.h +++ b/include/cutlass/gemm/kernel/default_gemv.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h index ce61137f36..1d5601cdd8 100644 --- a/include/cutlass/gemm/kernel/gemm.h +++ b/include/cutlass/gemm/kernel/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
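
The new `DefaultGemmWithReduction` trait above follows the same composition style as `DefaultGemmUniversal`: it reuses the universal GEMM mainloop, swaps in an epilogue with reduction, and exposes the result as `GemmKernel` (the `arch::Sm70` specialization does the same with the Volta tensor-op epilogue). A sketch of an instantiation under assumed tile sizes; `MyOutputOp` and `MyReductionOp` are placeholders for application-supplied epilogue functors and are not taken from this diff:

```
// MyOutputOp / MyReductionOp stand in for the EpilogueOutputOp / EpilogueReductionOp
// functors an application would supply.
using GemmWithReductionKernel = typename cutlass::gemm::kernel::DefaultGemmWithReduction<
    cutlass::half_t, cutlass::layout::RowMajor,    cutlass::ComplexTransform::kNone, 8,
    cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::ComplexTransform::kNone, 8,
    cutlass::half_t, cutlass::layout::RowMajor,
    float,                                              // accumulation
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,             // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,               // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,                // instruction shape
    MyOutputOp,                                         // placeholder
    MyReductionOp,                                      // placeholder
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    3,                                                  // stages
    cutlass::arch::OpMultiplyAdd>::GemmKernel;
```
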
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_array.h b/include/cutlass/gemm/kernel/gemm_array.h index 1c59a53ae0..0df217421d 100644 --- a/include/cutlass/gemm/kernel/gemm_array.h +++ b/include/cutlass/gemm/kernel/gemm_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_batched.h b/include/cutlass/gemm/kernel/gemm_batched.h index 45ec7756f7..ceefed127f 100644 --- a/include/cutlass/gemm/kernel/gemm_batched.h +++ b/include/cutlass/gemm/kernel/gemm_batched.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_pipelined.h b/include/cutlass/gemm/kernel/gemm_pipelined.h index 02c7ba254b..39f328a30b 100644 --- a/include/cutlass/gemm/kernel/gemm_pipelined.h +++ b/include/cutlass/gemm/kernel/gemm_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h index b9626145fe..0151848f38 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h index e7fa89dc74..05bde223bf 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h index 72ca5a4743..e009567e4d 100644 --- a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h +++ b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h index bba6217160..0ff5ce999c 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.h +++ b/include/cutlass/gemm/kernel/gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemv_batched_strided.h b/include/cutlass/gemm/kernel/gemv_batched_strided.h index ea8d9bdf85..63f4d6e37e 100755 --- a/include/cutlass/gemm/kernel/gemv_batched_strided.h +++ b/include/cutlass/gemm/kernel/gemv_batched_strided.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/sparse_gemm.h b/include/cutlass/gemm/kernel/sparse_gemm.h index 730745fdc8..9d9e0a282d 100644 --- a/include/cutlass/gemm/kernel/sparse_gemm.h +++ b/include/cutlass/gemm/kernel/sparse_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma.h b/include/cutlass/gemm/thread/mma.h index 15dfe4338e..e163d8930a 100644 --- a/include/cutlass/gemm/thread/mma.h +++ b/include/cutlass/gemm/thread/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm50.h b/include/cutlass/gemm/thread/mma_sm50.h index 6d52efb023..e7bbbc90a6 100644 --- a/include/cutlass/gemm/thread/mma_sm50.h +++ b/include/cutlass/gemm/thread/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm60.h b/include/cutlass/gemm/thread/mma_sm60.h index 07e2d55629..562c682e8a 100644 --- a/include/cutlass/gemm/thread/mma_sm60.h +++ b/include/cutlass/gemm/thread/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm61.h b/include/cutlass/gemm/thread/mma_sm61.h index 09fd356236..81430d986a 100644 --- a/include/cutlass/gemm/thread/mma_sm61.h +++ b/include/cutlass/gemm/thread/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_gemv_core.h b/include/cutlass/gemm/threadblock/default_gemv_core.h index 9d692d6db5..a4ac423ebe 100755 --- a/include/cutlass/gemm/threadblock/default_gemv_core.h +++ b/include/cutlass/gemm/threadblock/default_gemv_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma.h b/include/cutlass/gemm/threadblock/default_mma.h index fbf76510db..155508096c 100644 --- a/include/cutlass/gemm/threadblock/default_mma.h +++ b/include/cutlass/gemm/threadblock/default_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core.h b/include/cutlass/gemm/threadblock/default_mma_core.h index a7ac7c44b2..5a5426f4c6 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core.h +++ b/include/cutlass/gemm/threadblock/default_mma_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/include/cutlass/gemm/threadblock/default_mma_core_simt.h index ba3a161650..2ec882cc45 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_simt.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h index 30b3b3c0aa..4e75154630 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h index d797704e79..ded7f119d3 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h index 065ed74694..8b0c0de628 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h index f7298e4e7e..26c9b95b10 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h index 8214494321..64efa9a0f4 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h index 2f4a079619..d5f963bd01 100644 --- a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h +++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h index 04a856e9a4..a204f95410 100644 --- a/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h +++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h index 36c5c54ee9..0c3972145e 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h index 613c88e3ea..6a1e48fc0c 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h index 697d22bf6d..9528744b7d 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_sparse_mma.h b/include/cutlass/gemm/threadblock/default_sparse_mma.h index 3f6354771e..b390382a0f 100644 --- a/include/cutlass/gemm/threadblock/default_sparse_mma.h +++ b/include/cutlass/gemm/threadblock/default_sparse_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/gemv.h b/include/cutlass/gemm/threadblock/gemv.h new file mode 100755 index 0000000000..584b375da6 --- /dev/null +++ b/include/cutlass/gemm/threadblock/gemv.h @@ -0,0 +1,141 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Template for a threadblock-scoped GEMV kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix-vector product using SIMT math instructions.
+template < + class Core_ //< GemvCore +> +class Gemv { +public: + using Shape = typename Core_::Shape; + + /// The MMA operator that computes GEMV + using Operator = typename Core_::Operator; + + /// Iterates over A in global memory + using IteratorA = typename Core_::IteratorA; + + /// Iterates over B in global memory + using IteratorB = typename Core_::IteratorB; + + /// Iterates over C in global memory + using IteratorC = typename Core_::IteratorC; + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of operand accumulator loaded/stored to global memory + using FragmentC = typename Operator::FragmentC; + + /// Shape of the per-thread GEMV operation + using ThreadShape = typename Core_::ThreadShape; + +public: + CUTLASS_DEVICE + Gemv() { } + + CUTLASS_DEVICE + void operator()( + GemmCoord const &problem_size, ///< problem size of batched GEMV + FragmentC &accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const &src_accum) { ///< source accumulator tile + + // + // Prologue + // + + FragmentA frag_A; + FragmentB frag_B; + frag_A.clear(); + frag_B.clear(); + + iterator_A.load(frag_A); + iterator_B.load(frag_B); + ++iterator_A; + ++iterator_B; + + // + // Mainloop + // + Operator thread_mma; + int gemm_k = problem_size.k(); + + if (gemm_k < Shape::kK) + { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + + // iterate over K to accumulate result + CUTLASS_GEMM_LOOP + for (; gemm_k > 0; gemm_k -= Shape::kK) { + thread_mma(accum, frag_A, frag_B, accum); + + iterator_A.load(frag_A); + iterator_B.load(frag_B); + ++iterator_A; + ++iterator_B; + + if (gemm_k < Shape::kK) + { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/include/cutlass/gemm/threadblock/mma_base.h b/include/cutlass/gemm/threadblock/mma_base.h index dbf3d31f56..a56d81f0e0 100644 --- a/include/cutlass/gemm/threadblock/mma_base.h +++ b/include/cutlass/gemm/threadblock/mma_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_multistage.h b/include/cutlass/gemm/threadblock/mma_multistage.h index 804e3373a3..d07b236d40 100644 --- a/include/cutlass/gemm/threadblock/mma_multistage.h +++ b/include/cutlass/gemm/threadblock/mma_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_pipelined.h b/include/cutlass/gemm/threadblock/mma_pipelined.h index 80954f6c4f..5fcbdebe1a 100644 --- a/include/cutlass/gemm/threadblock/mma_pipelined.h +++ b/include/cutlass/gemm/threadblock/mma_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h index b37b418462..22c9b3f863 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h index 18e63b5805..fedad053b0 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h index ecf722d92a..0e48b2bd1a 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_singlestage.h b/include/cutlass/gemm/threadblock/mma_singlestage.h index 373d985ac6..edcef03699 100644 --- a/include/cutlass/gemm/threadblock/mma_singlestage.h +++ b/include/cutlass/gemm/threadblock/mma_singlestage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_sparse_base.h b/include/cutlass/gemm/threadblock/mma_sparse_base.h index c6bb3411fc..eb192f723e 100644 --- a/include/cutlass/gemm/threadblock/mma_sparse_base.h +++ b/include/cutlass/gemm/threadblock/mma_sparse_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_sparse_multistage.h b/include/cutlass/gemm/threadblock/mma_sparse_multistage.h index a2ff84664a..e865585b1c 100644 --- a/include/cutlass/gemm/threadblock/mma_sparse_multistage.h +++ b/include/cutlass/gemm/threadblock/mma_sparse_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h index 587de56a66..79314088dd 100644 --- a/include/cutlass/gemm/threadblock/threadblock_swizzle.h +++ b/include/cutlass/gemm/threadblock/threadblock_swizzle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h index 3c6772aff7..b397b4567d 100644 --- a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h index 637e39009e..bdc2341e3f 100644 --- a/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_tensor_op.h index ea9ab5c931..8240c430b4 100644 --- a/include/cutlass/gemm/warp/default_mma_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h index 06d3afa59f..c550b022f7 100644 --- a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h +++ b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h index 582fb472e1..5f8864a500 100644 --- a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma.h b/include/cutlass/gemm/warp/mma.h index 16c736e2b7..7180434e1e 100644 --- a/include/cutlass/gemm/warp/mma.h +++ b/include/cutlass/gemm/warp/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_complex_tensor_op.h index a34c16df07..5877b95f3b 100644 --- a/include/cutlass/gemm/warp/mma_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h index b95af0df15..ba74fe96e3 100644 --- a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h index 4ab139023a..7cfad2ea6d 100644 --- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h index 8d9417b0fb..dacfe266e8 100644 --- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_planar_complex.h b/include/cutlass/gemm/warp/mma_planar_complex.h index c579044065..6328105b78 100644 --- a/include/cutlass/gemm/warp/mma_planar_complex.h +++ b/include/cutlass/gemm/warp/mma_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_simt.h b/include/cutlass/gemm/warp/mma_simt.h index 306a08d17c..a86e06e461 100644 --- a/include/cutlass/gemm/warp/mma_simt.h +++ b/include/cutlass/gemm/warp/mma_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_simt_policy.h b/include/cutlass/gemm/warp/mma_simt_policy.h index 6abd0bf6a8..de89d5123a 100644 --- a/include/cutlass/gemm/warp/mma_simt_policy.h +++ b/include/cutlass/gemm/warp/mma_simt_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h index ed1e598702..660db38803 100644 --- a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -214,7 +214,7 @@ class MmaSimtTileIterator *dst_ptr = @@ -273,6 +273,213 @@ class MmaSimtTileIterator +class MmaSimtTileIterator { +public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand::kA; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::RowMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + // + // Derived quantities + // + + static_assert(!(Shape::kRow % Policy::WarpShape::kRow), + "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension."); + + static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero."); + static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero."); + static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero."); + static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero."); + + /// Thread-level shape of a fragment + using ThreadShape = MatrixShape< + Shape::kRow / Policy::WarpShape::kRow, + Shape::kColumn + >; + + static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM), + "Thread-level GEMM must be divisible by Policy::LaneMmaShape."); + + /// Number of individual loads (scalar loads) + using Iterations = MatrixShape< + ThreadShape::kRow / Policy::LaneMmaShape::kM, + ThreadShape::kColumn + >; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + +private: + + /// Internal reference + cutlass::TensorRef ref_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaSimtTileIterator() { } + + /// 
Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaSimtTileIterator( + TensorRef ref, + int lane_id + ) { + + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + MatrixCoord(Policy::LaneMmaShape::kM, 0); + + ref.add_coord_offset(lane_offset); + + ref_.reset(ref.data(), ref.stride(0)); + + } + + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaSimtTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + + ref_.add_coord_offset({ + coord.row() * Shape::kRow, + coord.column() * Shape::kColumn}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaSimtTileIterator & operator++() { + + ref_.add_coord_offset({0, Shape::kColumn}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaSimtTileIterator & operator--() { + + ref_.add_coord_offset({0, -Shape::kColumn}); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (scalar loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kColumn; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Iterations::kRow; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Policy::LaneMmaShape::kM; i++) { + + frag[m * Policy::LaneMmaShape::kM + i + k * Iterations::kRow] = + *(ref_.data() + + ref_.offset({m * Policy::WarpShape::kRow * Policy::LaneMmaShape::kM + i, k}) + + pointer_offset); + } + } + } + } + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const { + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kColumn; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Iterations::kRow; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Policy::LaneMmaShape::kM; i++) { + + *(ref_.data() + ref_.offset(m * Policy::WarpShape::kM * Policy::LaneMmaShape::kM + i, k) + pointer_offset) = + frag[m * Policy::LaneMmaShape::kM + i + k * Iterations::kM]; + } + } + } + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) const { + store_with_pointer_offset(frag, 0); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation here + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Specialization for B operands of row-major layouts /// /// Concept: MutableRandomAccessContiguousTileIteratorConcept @@ -354,7 +561,6 @@ class MmaSimtTileIterator, layout::RowMajor> ref_; - public: /// Default ctor constructs null iterator @@ -417,7 +623,7 @@ class MmaSimtTileIterator +class MmaSimtTileIterator { +public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand::kB; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::ColumnMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + // + // Derived quantities + // + + static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn), + "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension."); + + static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero."); + static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero."); + static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero."); + static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero."); + + /// Thread-level shape of a fragment + using ThreadShape = MatrixShape< + Shape::kRow, + Shape::kColumn / Policy::WarpShape::kColumn + >; + + static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), + "Thread-level GEMM must be divisible by Policy::LaneMmaShape."); + + /// Number of individual loads + using Iterations = MatrixShape< + ThreadShape::kRow, + ThreadShape::kColumn / Policy::LaneMmaShape::kN + >; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + +private: + + /// Internal reference + cutlass::TensorRef ref_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaSimtTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaSimtTileIterator( + TensorRef ref, + int lane_id + ) { + + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + MatrixCoord(0, Policy::LaneMmaShape::kN); + + ref.add_coord_offset(lane_offset); + + ref_.reset(ref.data(), ref.stride(0)); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaSimtTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + + ref_.add_coord_offset({ + coord.row() * Shape::kRow, + coord.column() * Shape::kColumn}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaSimtTileIterator & operator++() { + + 
ref_.add_coord_offset({Shape::kRow, 0}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaSimtTileIterator & operator--() { + + ref_.add_coord_offset({-Shape::kRow, 0}); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (scalar loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kRow; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kColumn; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Policy::LaneMmaShape::kN; ++i) { + frag[n * Policy::LaneMmaShape::kN + i + k * Iterations::kColumn] = + *(ref_.data() + + ref_.offset({k, n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + i}) + + pointer_offset); + } + } + } + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const { + + Array const *src_ptr = + reinterpret_cast *>(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kM; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kN; ++n) { + *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kN}) + pointer_offset / Policy::LaneMmaShape::kN) = + src_ptr[n + k * Iterations::kN]; + } + } + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store(Fragment const &frag, Index pointer_offset) const { + store_with_pointer_offset(frag, 0); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation here + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Specialization for C operands of column-major layouts /// /// Concept: MutableRandomAccessContiguousTileIteratorConcept diff --git a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h index ba86e08583..86c50d3768 100644 --- a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h index a60a86020a..a6f83129fc 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h index 5b5b5345a0..e7a77f72ac 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h @@ -44,9 +44,7 @@ template < /// Shape of one matrix product operation (concept: MatrixShape) typename InstructionShape_, /// Output operation on the fragment - typename OutputOp_, - /// Whether beta is zero - bool IsBetaZero_ > + typename OutputOp_> class MmaTensorOpFragmentIterator; @@ -68,7 +66,7 @@ template < typename OutputOp_> class MmaTensorOpFragmentIterator { + InstructionShape_, OutputOp_> { public: /// Shape of warp tile to load (concept: MatrixShape) @@ -105,8 +103,10 @@ class MmaTensorOpFragmentIterator(&frag); - int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow; - int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow - * MmaIterations::kColumn; + int index = index_ * MmaIterations::kCount; CUTLASS_PRAGMA_UNROLL for (int n = 0; n < MmaIterations::kColumn; n++) { for (int m = 0; m < MmaIterations::kRow; m++) { int accumulator_access_offset = - (n + index_n) * AccumulatorIterations::kRow + m + index_m; + n * AccumulatorIterations::kRow + m + index; - frag_ptr[n * MmaIterations::kRow + m].clear(); + frag_ptr[m * MmaIterations::kColumn + n].clear(); if(!(is_residual_tile_ && index_ >= kResidualIndex)) - //frag_ptr[n * MmaIterations::kRow + m] = accumulators_[accumulator_access_offset]; - frag_ptr[n * MmaIterations::kRow + m] = output_op(accumulators_[accumulator_access_offset], src_fragment); + frag_ptr[m * MmaIterations::kColumn + n] = output_op(accumulators_[accumulator_access_offset], src_fragment); } } } @@ -251,7 +247,7 @@ template < typename OutputOp_> class MmaTensorOpFragmentIterator { + InstructionShape_, OutputOp_> { public: /// Shape of warp tile to load (concept: MatrixShape) @@ -294,7 +290,7 @@ class MmaTensorOpFragmentIterator(&frag); -// NumericArrayConverter fragmentConverter; int index = index_ * AccessIterations::kCount; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < AccessIterations::kCount; i++) { -// int index_m = (index % AccessIterations::kCount) / (AccessIterations::kColumn * kIterationsPerInstruction) -// * kIterationsPerInstruction + index % kIterationsPerInstruction; -// -// int index_n = (index / AccessIterations::kCount) * MmaIterations::kColumn + -// (index % (AccessIterations::kColumn * kIterationsPerInstruction)) -// / kIterationsPerInstruction * AccessIterations::kColumn; -// -// int accumulator_access_offset = index_m / kIterationsPerInstruction * AccessIterations::kCount * kIterationsPerInstruction -// + index_m % kIterationsPerInstruction + index_n * kIterationsPerInstruction; int accumulator_access_offset = index / AccessIterations::kCount * (MmaIterations::kColumn * kIterationsPerInstruction) + (index % AccessIterations::kCount) / (AccessIterations::kColumn * kIterationsPerInstruction) * @@ -442,7 +428,6 @@ class MmaTensorOpFragmentIterator= kResidualIndex)) - // frag_ptr[m * MmaIterations::kColumn + n] = fragmentConverter(accumulators_[accumulator_access_offset]); frag_ptr[i*kIterationsPerAccess + j] = output_op(accumulators_[accumulator_access_offset + j * kAccessStride], src_fragment); } 
index++; diff --git a/include/cutlass/gemm/warp/mma_tensor_op_policy.h b/include/cutlass/gemm/warp/mma_tensor_op_policy.h index 68b28bfff1..4dd57da20a 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_policy.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h index cc1a909532..409eda4082 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h index 59f68a42a1..9d4d81d49d 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h index c57cc6a8d9..4be831f366 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -238,7 +238,7 @@ class MmaVoltaTensorOpMultiplicandTileIterator< pointer_[0] = pointer_[1]; pointer_[1] = tmp_pointer; } - contiguous_offset = contiguous_offset / 2; + contiguous_offset = contiguous_offset / 2 * 2; } int offset = (strided_offset * InstructionShape::kStrided) * stride_ * diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h index e286ed1162..4d45ecf5dd 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h index a7e69816f1..5a82a702fa 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h index 64be655680..6fd783c6dd 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h index 824e207d74..c000dd6283 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/tile_iterator_planar_complex.h b/include/cutlass/gemm/warp/tile_iterator_planar_complex.h index a3050c4299..ef5767198d 100644 --- a/include/cutlass/gemm/warp/tile_iterator_planar_complex.h +++ b/include/cutlass/gemm/warp/tile_iterator_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/half.h b/include/cutlass/half.h index 3d0bd34724..5503f5b318 100644 --- a/include/cutlass/half.h +++ b/include/cutlass/half.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/integer_subbyte.h b/include/cutlass/integer_subbyte.h index df32042d0e..bd8a6a0108 100644 --- a/include/cutlass/integer_subbyte.h +++ b/include/cutlass/integer_subbyte.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -144,7 +144,7 @@ struct integer_subbyte { /// Greater than CUTLASS_HOST_DEVICE bool operator>(integer_subbyte const &rhs) const { - return !(rhs < *this); + return !(*this <= rhs); } }; diff --git a/include/cutlass/kernel_launch.h b/include/cutlass/kernel_launch.h index bd84a35781..cda1896c0f 100644 --- a/include/cutlass/kernel_launch.h +++ b/include/cutlass/kernel_launch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/layout.h b/include/cutlass/layout/layout.h index 775357d125..4d78c4c45d 100644 --- a/include/cutlass/layout/layout.h +++ b/include/cutlass/layout/layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/matrix.h b/include/cutlass/layout/matrix.h index 0590492625..668245fcb7 100644 --- a/include/cutlass/layout/matrix.h +++ b/include/cutlass/layout/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/pitch_linear.h b/include/cutlass/layout/pitch_linear.h index a6158b32a4..a44825c1d8 100644 --- a/include/cutlass/layout/pitch_linear.h +++ b/include/cutlass/layout/pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h index 7f608dcf76..1196b726eb 100644 --- a/include/cutlass/layout/tensor.h +++ b/include/cutlass/layout/tensor.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -149,11 +149,11 @@ class TensorNHWC { fast_divmod(w, tmp, w, int(stride_[0]), c_mul, c_shr); #else - n = int(index / (stride_[0] * stride_[1] * stride_[2])); - LongIndex residual = index % (stride_[0] * stride_[1] * stride_[2]); + n = int(index / stride_[2]); + LongIndex residual = index % stride_[2]; - h = int(residual / (stride_[0] * stride_[1])); - residual = (residual % (stride_[0] * stride_[1])); + h = int(residual / stride_[1]); + residual = (residual % stride_[1]); w = int(residual / stride_[0]); c = int(residual % stride_[0]); @@ -314,6 +314,15 @@ class TensorNCxHWx { CUTLASS_HOST_DEVICE TensorNCxHWx(Stride const &stride = Stride(0)): stride_(stride) { } + /// Constructor + CUTLASS_HOST_DEVICE + TensorNCxHWx( + typename Stride::Index stride_w, ///< number of elements between adjacent W coordinates + typename Stride::Index stride_h, ///< number of elements between adjacent H coordinates + typename Stride::Index stride_n ///< number of elements between adjacent N coordinates + ): + stride_(make_Coord(stride_w, stride_h, stride_n)) { } + /// Helper returns a layout to a tightly packed tensor CUTLASS_HOST_DEVICE static TensorNCxHWx packed(TensorCoord const &extent) { @@ -404,6 +413,15 @@ class TensorCxRSKx { CUTLASS_HOST_DEVICE TensorCxRSKx(Stride const &stride = Stride(0)): stride_(stride) { } + /// Constructor + CUTLASS_HOST_DEVICE + TensorCxRSKx( + typename Stride::Index stride_w, ///< number of elements between adjacent W coordinates + typename Stride::Index stride_h, ///< number of elements between adjacent H coordinates + typename Stride::Index stride_n ///< number of elements between adjacent N coordinates + ): + stride_(make_Coord(stride_w, stride_h, stride_n)) { } + /// Helper returns a layout to a tightly packed tensor CUTLASS_HOST_DEVICE static TensorCxRSKx packed(TensorCoord const &extent) { @@ -529,6 +547,12 @@ class TensorNDHWC { LongIndex(stride_[3] * coord.n()); } + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord coord) const { + return coord.contiguous() + LongIndex(coord.strided() * stride_[3]); + } + /// Returns the stride of the layout CUTLASS_HOST_DEVICE Stride stride() const { diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm70.h b/include/cutlass/layout/tensor_op_multiplicand_sm70.h index 03f87db392..9d375e6c48 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm70.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
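The `TensorNHWC::inverse` change above reflects that `stride_` already holds cumulative strides (`C`, `W*C`, `H*W*C`), so the linear index is decomposed by dividing by each stride directly rather than by products of strides. A standalone sketch of that round trip (hypothetical helper functions, not the CUTLASS layout class):

```cpp
#include <cassert>
#include <cstdint>

// NHWC layout with cumulative strides:
//   stride_w = C          (elements between adjacent W coordinates)
//   stride_h = W * C      (elements between adjacent H coordinates)
//   stride_n = H * W * C  (elements between adjacent N coordinates)
struct NhwcCoord { int n, h, w, c; };

int64_t offset(NhwcCoord p, int64_t stride_w, int64_t stride_h, int64_t stride_n) {
  return p.c + p.w * stride_w + p.h * stride_h + int64_t(p.n) * stride_n;
}

// Inverse: divide by the cumulative strides directly (as in the patched code);
// multiplying strides together would double-count the cumulative factors.
NhwcCoord inverse(int64_t index, int64_t stride_w, int64_t stride_h, int64_t stride_n) {
  NhwcCoord p;
  p.n = int(index / stride_n);
  int64_t residual = index % stride_n;
  p.h = int(residual / stride_h);
  residual %= stride_h;
  p.w = int(residual / stride_w);
  p.c = int(residual % stride_w);
  return p;
}

int main() {
  int64_t C = 32, W = 7, H = 7;
  int64_t sw = C, sh = W * C, sn = H * W * C;
  NhwcCoord p{2, 3, 5, 17};
  NhwcCoord q = inverse(offset(p, sw, sh, sn), sw, sh, sn);
  assert(q.n == p.n && q.h == p.h && q.w == p.w && q.c == p.c);
  return 0;
}
```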
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm75.h b/include/cutlass/layout/tensor_op_multiplicand_sm75.h index b52483355c..5f81c1dae6 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm75.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm80.h b/include/cutlass/layout/tensor_op_multiplicand_sm80.h index e5963a2a80..5d2ffc5ffa 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm80.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/vector.h b/include/cutlass/layout/vector.h index b54b6b3b18..126d30b2fe 100644 --- a/include/cutlass/layout/vector.h +++ b/include/cutlass/layout/vector.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix.h b/include/cutlass/matrix.h index 5d05ee8994..971f125e45 100644 --- a/include/cutlass/matrix.h +++ b/include/cutlass/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_coord.h b/include/cutlass/matrix_coord.h index b432665e8c..dcf25cc64f 100644 --- a/include/cutlass/matrix_coord.h +++ b/include/cutlass/matrix_coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_shape.h b/include/cutlass/matrix_shape.h index cb3118c2d6..5b672ebbc6 100644 --- a/include/cutlass/matrix_shape.h +++ b/include/cutlass/matrix_shape.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h index 766478e085..57f3984b41 100644 --- a/include/cutlass/numeric_conversion.h +++ b/include/cutlass/numeric_conversion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -1359,6 +1359,60 @@ struct PreferredRoundingMode { ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Packs predicates into an array. +template +struct PackPredicates { + using result_type = Array; + + static_assert(!(N % 4), "Must pack predicates in a count that is a multiple of 4"); + + CUTLASS_HOST_DEVICE + result_type operator()(bool const predicates[]) { + + result_type packed; + packed.clear(); + + int const kWordSize = 8; + uint8_t *bytes = reinterpret_cast(packed.data()); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + int word_idx = (i / kWordSize); + int bit_idx = (i % kWordSize); + + uint8_t mask = ((predicates[i] ? 1u : 0u) << bit_idx); + bytes[word_idx] = (bytes[word_idx] | mask); + } + return packed; + } +}; + +/// Packs predicates into an array +template +struct UnpackPredicates { + using result_type = Array; + + static_assert(!(N % 4), "Must unpack predicates in a count that is a multiple of 4"); + + CUTLASS_HOST_DEVICE + void operator()(bool predicates[], result_type const &packed) { + + int const kWordSize = 8; + uint8_t const *bytes = reinterpret_cast(packed.data()); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + int word_idx = (i / kWordSize); + int bit_idx = (i % kWordSize); + + predicates[i] = bool((bytes[word_idx] >> bit_idx) & 0x1); + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/numeric_types.h b/include/cutlass/numeric_types.h index 9479ccb08b..363997b620 100644 --- a/include/cutlass/numeric_types.h +++ b/include/cutlass/numeric_types.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
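The `PackPredicates` / `UnpackPredicates` helpers added to `numeric_conversion.h` above store one predicate per bit, eight per byte, least-significant bit first. A self-contained host sketch of the same bit layout (plain arrays and hypothetical function names instead of `cutlass::Array`):

```cpp
#include <cassert>
#include <cstdint>

// Pack N boolean predicates into ceil(N / 8) bytes, one bit each, LSB first,
// mirroring the bit layout used by PackPredicates above.
template <int N>
void pack_predicates(uint8_t bytes[(N + 7) / 8], bool const predicates[N]) {
  for (int i = 0; i < (N + 7) / 8; ++i) bytes[i] = 0;
  for (int i = 0; i < N; ++i) {
    int word_idx = i / 8;
    int bit_idx = i % 8;
    bytes[word_idx] |= uint8_t((predicates[i] ? 1u : 0u) << bit_idx);
  }
}

template <int N>
void unpack_predicates(bool predicates[N], uint8_t const bytes[(N + 7) / 8]) {
  for (int i = 0; i < N; ++i) {
    predicates[i] = ((bytes[i / 8] >> (i % 8)) & 0x1) != 0;
  }
}

int main() {
  bool in[8] = {true, false, true, true, false, false, true, false};
  uint8_t packed[1];
  pack_predicates<8>(packed, in);
  assert(packed[0] == 0b01001101);   // bit i holds predicate i
  bool out[8];
  unpack_predicates<8>(out, packed);
  for (int i = 0; i < 8; ++i) assert(out[i] == in[i]);
  return 0;
}
```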
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -38,7 +38,6 @@ namespace cutlass { - ///////////////////////////////////////////////////////////////////////////////////////////////// /// Defines the size of an element in bits diff --git a/include/cutlass/platform/platform.h b/include/cutlass/platform/platform.h index 826b3977fc..e9ccae2e7f 100644 --- a/include/cutlass/platform/platform.h +++ b/include/cutlass/platform/platform.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h index 9293696225..6ef748fb2e 100644 --- a/include/cutlass/predicate_vector.h +++ b/include/cutlass/predicate_vector.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/quaternion.h b/include/cutlass/quaternion.h index aef35025d3..67e0634afb 100644 --- a/include/cutlass/quaternion.h +++ b/include/cutlass/quaternion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/real.h b/include/cutlass/real.h index 99af846b19..faa7d92d0d 100644 --- a/include/cutlass/real.h +++ b/include/cutlass/real.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/device/reduce_split_k.h b/include/cutlass/reduction/device/reduce_split_k.h index e3626f88c0..4c044a4cab 100644 --- a/include/cutlass/reduction/device/reduce_split_k.h +++ b/include/cutlass/reduction/device/reduce_split_k.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/kernel/reduce_split_k.h b/include/cutlass/reduction/kernel/reduce_split_k.h index 586c90d86a..870b94b8ed 100644 --- a/include/cutlass/reduction/kernel/reduce_split_k.h +++ b/include/cutlass/reduction/kernel/reduce_split_k.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/thread/reduce.h b/include/cutlass/reduction/thread/reduce.h index 698b174f95..a0f2d18fff 100644 --- a/include/cutlass/reduction/thread/reduce.h +++ b/include/cutlass/reduction/thread/reduce.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/thread/reduction_operators.h b/include/cutlass/reduction/thread/reduction_operators.h index 6f9aeb6f32..3c29bf7dc3 100644 --- a/include/cutlass/reduction/thread/reduction_operators.h +++ b/include/cutlass/reduction/thread/reduction_operators.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -35,6 +35,8 @@ #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" +///////////////////////////////////////////////////////////////////////////////////////////////// + namespace cutlass { namespace reduction { namespace thread { @@ -97,6 +99,131 @@ struct ReduceAdd { ///////////////////////////////////////////////////////////////////////////////////////////////// +namespace detail { + +/// Special handling for binary operators +template +struct VectorizeArrayOperation { + + using ValueType = Array; + + CUTLASS_HOST_DEVICE + ValueType operator()( + ReductionOp const &reduction_op, + ValueType const &lhs, + ValueType const &rhs) const { + + ValueType result; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = reduction_op(lhs[i], rhs[i]); + } + + return result; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ReduceArrayOperation { + + using ArrayType = Array; + + CUTLASS_HOST_DEVICE + Element operator()( + ReductionOp const &reduction_op, + ArrayType const &array) const { + + Element item = reduction_op(array[0], array[1]); + + CUTLASS_PRAGMA_UNROLL + for (int i = 2; i < N; ++i) { + item = reduction_op(item, array[i]); + } + + return item; + } +}; + +template +struct ReduceArrayOperation, uint1b_t, N> { + + using ArrayType = Array; + + CUTLASS_HOST_DEVICE + uint1b_t operator()( + logical_and const &reduction_op, + ArrayType const &array) const { + + uint8_t const *ptr = reinterpret_cast(&array); + bool item = false; + + CUTLASS_PRAGMA_UNROLL + for (int byte = 0; byte < (N + 7) / 8; ++byte) { + uint8_t bits = ptr[byte]; + item = (item || !bits); + } + + return uint1b_t(!item); + } +}; + +template +struct ReduceArrayOperation, uint1b_t, N> { + + using ArrayType = Array; + + CUTLASS_HOST_DEVICE + uint1b_t operator()( + logical_and const &reduction_op, + ArrayType const &array) const { + + uint8_t const *ptr = reinterpret_cast(&array); + bool item = true; + + CUTLASS_PRAGMA_UNROLL + for (int byte = 0; byte < (N + 7) / 8; ++byte) { + uint8_t bits = ptr[byte]; + item = (item || bits); + } + + return uint1b_t(item); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper function to infer template argument types +template +CUTLASS_HOST_DEVICE +Array ApplyArrayOperator( + ReductionOp const &reduction_op, + Array const &lhs, + Array const &rhs) { + + VectorizeArrayOperation vectorize_op; + + return vectorize_op(reduction_op, lhs, rhs); +} + +/// Helper to reduce an array +template +Element ReduceArray(ReductionOp const &reduction_op, Array const &array) { + ReduceArrayOperation reduce_array_op; + + return reduce_array_op(reduction_op, array); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace thread } // namespace reduction } // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/reduction/threadblock_swizzle.h b/include/cutlass/reduction/threadblock_swizzle.h index 2419cdf6f5..943b818d16 100644 --- a/include/cutlass/reduction/threadblock_swizzle.h +++ 
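The `detail` helpers added to `reduction_operators.h` above apply a scalar reduction operator either elementwise across two arrays or across all elements of one array. A minimal standalone equivalent using `std::array` and a callable (not the CUTLASS functors or the bit-packed `uint1b_t` specializations):

```cpp
#include <array>
#include <cassert>
#include <cstddef>
#include <functional>

// Elementwise combine of two arrays with a binary reduction operator
// (the role played by VectorizeArrayOperation / ApplyArrayOperator).
template <typename Op, typename T, std::size_t N>
std::array<T, N> apply_array(Op op, std::array<T, N> const &lhs, std::array<T, N> const &rhs) {
  std::array<T, N> result{};
  for (std::size_t i = 0; i < N; ++i) result[i] = op(lhs[i], rhs[i]);
  return result;
}

// Fold one array down to a scalar (the role played by ReduceArrayOperation / ReduceArray).
template <typename Op, typename T, std::size_t N>
T reduce_array(Op op, std::array<T, N> const &array) {
  T item = op(array[0], array[1]);
  for (std::size_t i = 2; i < N; ++i) item = op(item, array[i]);
  return item;
}

int main() {
  std::array<int, 4> a{1, 2, 3, 4}, b{10, 20, 30, 40};
  auto sums = apply_array(std::plus<int>{}, a, b);   // {11, 22, 33, 44}
  assert(sums[2] == 33);
  assert(reduce_array(std::plus<int>{}, a) == 10);
  return 0;
}
```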
b/include/cutlass/reduction/threadblock_swizzle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h index 3d6a43b952..d75959152b 100644 --- a/include/cutlass/relatively_equal.h +++ b/include/cutlass/relatively_equal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/semaphore.h b/include/cutlass/semaphore.h index dc5523dca1..87f519053e 100644 --- a/include/cutlass/semaphore.h +++ b/include/cutlass/semaphore.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/subbyte_reference.h b/include/cutlass/subbyte_reference.h index 6f7aab2c6d..950c8da4ee 100644 --- a/include/cutlass/subbyte_reference.h +++ b/include/cutlass/subbyte_reference.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -358,6 +358,12 @@ class SubbyteReference { return ptr_; } + /// Gets storage pointer + CUTLASS_HOST_DEVICE + Element * operator&() const { + return reinterpret_cast(ptr_); + } + /// Gets element offset within storage vector CUTLASS_HOST_DEVICE int element_offset() const { diff --git a/include/cutlass/tensor_coord.h b/include/cutlass/tensor_coord.h index b60bc11262..5c0c603171 100644 --- a/include/cutlass/tensor_coord.h +++ b/include/cutlass/tensor_coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_ref.h b/include/cutlass/tensor_ref.h index a805107c3d..2782b49fc0 100644 --- a/include/cutlass/tensor_ref.h +++ b/include/cutlass/tensor_ref.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. 
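The `operator&` added to `SubbyteReference` above exposes the underlying storage pointer of a proxy reference; sub-byte elements have no addressable storage of their own, so reads and writes go through mask-and-shift. A minimal host sketch of such a proxy (hypothetical `NibbleRef`, not the CUTLASS class):

```cpp
#include <cassert>
#include <cstdint>

// Proxy reference to a single 4-bit element packed two-per-byte.
struct NibbleRef {
  uint8_t *storage;  // byte containing the element
  int index;         // 0 = low nibble, 1 = high nibble

  int get() const { return (*storage >> (index * 4)) & 0xF; }

  NibbleRef &operator=(int value) {
    uint8_t mask = uint8_t(0xF << (index * 4));
    *storage = uint8_t((*storage & ~mask) | ((value & 0xF) << (index * 4)));
    return *this;
  }

  // Analogue of the operator& added above: expose the underlying storage.
  uint8_t *raw_pointer() const { return storage; }
};

int main() {
  uint8_t data[2] = {0, 0};
  NibbleRef ref{&data[0], 1};        // high nibble of byte 0
  ref = 0xA;
  assert(data[0] == 0xA0);
  assert(ref.get() == 0xA);
  assert(ref.raw_pointer() == &data[0]);
  return 0;
}
```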
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_ref_planar_complex.h b/include/cutlass/tensor_ref_planar_complex.h index 54611911ca..009608db8d 100644 --- a/include/cutlass/tensor_ref_planar_complex.h +++ b/include/cutlass/tensor_ref_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_view.h b/include/cutlass/tensor_view.h index fdbee1055e..333c559af9 100644 --- a/include/cutlass/tensor_view.h +++ b/include/cutlass/tensor_view.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -219,7 +219,9 @@ class TensorView : public TensorRef { TensorCoord const& location = TensorCoord() ///< resulting view's origin within the old view ) const { - return TensorView(ref(), extent.clamp(extent_ - location)).add_coord_offset(location); + TensorView result(this->ref(), extent.clamp(extent_ - location)); + result.add_coord_offset(location); + return result; } /// Returns the number of scalar elements needed to store tensor. diff --git a/include/cutlass/tensor_view_planar_complex.h b/include/cutlass/tensor_view_planar_complex.h index bdd29829da..80d32f1c00 100644 --- a/include/cutlass/tensor_view_planar_complex.h +++ b/include/cutlass/tensor_view_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -208,7 +208,9 @@ class TensorViewPlanarComplex : public TensorRefPlanarComplex TensorCoord const& location = TensorCoord() ///< resulting view's origin within the old view ) const { - return TensorViewPlanarComplex(ref(), extent.clamp(extent_ - location)).add_coord_offset(location); + TensorViewPlanarComplex result(this->ref(), extent.clamp(extent_ - location)); + result.add_coord_offset(location); + return result; } /// Returns the number of scalar elements needed to store tensor. diff --git a/include/cutlass/tfloat32.h b/include/cutlass/tfloat32.h index 2d28851299..67a7f1c7b0 100644 --- a/include/cutlass/tfloat32.h +++ b/include/cutlass/tfloat32.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
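The `subview` changes to `TensorView` and `TensorViewPlanarComplex` above replace a chained call on a temporary with a named result that is offset and then returned, so the returned object keeps both the coordinate offset and the clamped extent. A simplified sketch of why naming the derived object matters when the mutator is declared on the base type (hypothetical `Ref`/`View` types, assumptions only):

```cpp
#include <cassert>

struct Ref {
  int offset = 0;
  Ref &add_coord_offset(int delta) {   // mutator declared on the base type
    offset += delta;
    return *this;
  }
};

struct View : Ref {
  int extent = 0;
  explicit View(int extent_) : extent(extent_) {}
};

// Chaining the base-returning mutator on a temporary only yields a Ref, so the
// derived extent would not travel with the result. The patched subview()
// instead names the View, mutates it, and returns it intact:
View make_subview(int extent, int location) {
  View result(extent);
  result.add_coord_offset(location);
  return result;
}

int main() {
  View v = make_subview(8, 3);
  assert(v.extent == 8 && v.offset == 3);
  return 0;
}
```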
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/thread/matrix.h b/include/cutlass/thread/matrix.h index a54b347150..a7ffa6b5ab 100644 --- a/include/cutlass/thread/matrix.h +++ b/include/cutlass/thread/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/trace.h b/include/cutlass/trace.h index 39ffa2968c..62df598dd6 100644 --- a/include/cutlass/trace.h +++ b/include/cutlass/trace.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h index c19f79cbbc..11285014af 100644 --- a/include/cutlass/transform/pitch_linear_thread_map.h +++ b/include/cutlass/transform/pitch_linear_thread_map.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/thread/unaryOp.h b/include/cutlass/transform/thread/unaryOp.h index de4f79b972..6434db54f6 100644 --- a/include/cutlass/transform/thread/unaryOp.h +++ b/include/cutlass/transform/thread/unaryOp.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h index 7dce3228ec..a6bdca8f21 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h @@ -1,40 +1,39 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - *modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - *notice, this list of conditions and the following disclaimer in the - *documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its - *contributors may be used to endorse or promote products derived from this - *software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, - *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING - *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT,INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /*! \file \brief Templates calculating the address and predicates to the load of tiles - from pitch-linear rank=2 tensors. + from pitch-linear rank=2 tensors. - This iterator uses masks to guard out-of-bounds accesses and visits the last - "residue" tile first, with the objective of minimizing predicate mask updates - during steady-state operation. + This iterator uses masks to guard out-of-bounds accesses. The first tile this + iterator visits maybe partial, then the remaining tiles are complete. So, we + only need to compute the predicates twice, once before the first tile and + once for the remaining full tiles which can share the same predicates. 
A precomputed "Params" object minimizes the amount of state that must be - stored in registers, and integer addition is used to advance the pointer - through memory. + stored in registers, and integer addition is used to advance the pointer + through memory. */ #pragma once diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h index 97ab909c74..278766fd36 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h @@ -1,27 +1,25 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - *modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - *notice, this list of conditions and the following disclaimer in the - *documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its - *contributors may be used to endorse or promote products derived from this - *software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, - *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING - *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT,INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /*! \file diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_iterator.h index 48d25ef42a..603d1fcb95 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -25,8 +25,10 @@ /*! \file \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. - This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile - first, with the objective of minimizing predicate mask updates during steady-state operation. + This iterator uses masks to guard out-of-bounds accesses. The first tile this + iterator visits maybe partial, then the remaining tiles are complete. So, we + only need to compute the predicates twice, once before the first tile and + once for the remaining full tiles which can share the same predicates. A precomputed "Params" object minimizes the amount of state that must be stored in registers, and integer addition is used to advance the pointer through memory. diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h index 0342a43464..9895e74c7a 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h index 0d775dffba..3541cd752e 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
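The updated iterator documentation above describes visiting the possibly partial first tile and then only complete tiles, so guard predicates need to be computed just twice. A simplified host sketch of that control flow (plain loops standing in for the tile iterator):

```cpp
#include <cassert>
#include <vector>

// The first tile covers the partial "residue" extent; every later tile is full,
// so a single set of always-true predicates is reused for the steady state.
int sum_with_residue_first(std::vector<int> const &data, int tile) {
  int total = int(data.size());
  int residue = total % tile;
  int first = (residue ? residue : tile);   // first tile may be partial

  int sum = 0;
  // Predicates for the (possibly partial) first tile.
  for (int i = 0; i < tile; ++i) {
    if (i < first) sum += data[i];          // guard needed only here
  }
  // Remaining tiles are complete; no per-element guard recomputation.
  for (int start = first; start < total; start += tile) {
    for (int i = 0; i < tile; ++i) sum += data[start + i];
  }
  return sum;
}

int main() {
  std::vector<int> data(10, 1);             // 10 elements, tile of 4: tiles of 2 + 4 + 4
  assert(sum_with_residue_first(data, 4) == 10);
  return 0;
}
```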
* * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h index 31f529e004..ec30b48ed8 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -182,7 +182,15 @@ class RegularTileAccessIterator< return prev; } - /// Adds a tile offset + /// Adds a tile offset in the unit of tile. + /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory. + /// Below layouts are the shared memory layouts. Current SM50 SIMT kernels only use col major A and row major B. + /// For row major A operand, k dimension is contiguous dimension; + /// For col major A operand, k dimension is strided dimension; + /// For row major B operand, k dimension is strided dimension; + /// For col major B operand, k dimension is contiguous dimension. + /// Below two classes map col/row major to the pitch linear coordinates used + /// in this base class. CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord) { add_pointer_offset(coord.contiguous() * Shape::kContiguous + diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h index 32043130bd..e0c44b1c48 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h index 5a0c74fdc6..5861ca687c 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
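The `add_tile_offset` comment added above depends on how row-major and column-major matrix coordinates map onto pitch-linear `(contiguous, strided)` coordinates, which determines whether the GEMM k dimension advances along the contiguous or the strided axis. A small sketch of those two mappings (hypothetical helper functions):

```cpp
#include <cassert>
#include <utility>

// Pitch-linear coordinates are (contiguous, strided).
// Row-major:    (row, column) -> (contiguous = column, strided = row)
// Column-major: (row, column) -> (contiguous = row,    strided = column)
std::pair<int, int> row_major_to_pitch_linear(int row, int column) {
  return {column, row};
}

std::pair<int, int> column_major_to_pitch_linear(int row, int column) {
  return {row, column};
}

int main() {
  // A is M x K: for row-major A the k index (column) lands in the contiguous slot;
  // for column-major A it lands in the strided slot, matching the comment above.
  assert(row_major_to_pitch_linear(/*m=*/1, /*k=*/5).first == 5);
  assert(column_major_to_pitch_linear(/*m=*/1, /*k=*/5).second == 5);
  return 0;
}
```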
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator.h b/include/cutlass/transform/threadblock/regular_tile_iterator.h index d7928ac00a..e1978f361d 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h index 2dcd57d658..831131f0d7 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -227,14 +227,21 @@ class RegularTileIterator::value * (coord.contiguous() * Shape::kContiguous + coord.strided() * Shape::kStrided * stride_) / 8; add_pointer_offset(offset); } - }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h index 85d702fec6..abfba6b8b4 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h index c7f0690779..c35f131437 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h index 82c8842ec0..0d2bbeea4f 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/uint128.h b/include/cutlass/uint128.h new file mode 100644 index 0000000000..cfcb696e4d --- /dev/null +++ b/include/cutlass/uint128.h @@ -0,0 +1,253 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Defines an unsigned 128b integer with several operators to support 64-bit integer division. 
+*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#include +#include +#include +#endif + +#include "cutlass/cutlass.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Optionally enable GCC's built-in type +#if defined(__x86_64) && !defined(__CUDA_ARCH__) +#if defined(__GNUC__) +#define CUTLASS_UINT128_NATIVE +#elif defined(_MSC_VER) +#define CUTLASS_INT128_ARITHMETIC +#include +#endif +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///! Unsigned 128b integer type +struct uint128_t { + + /// Size of one part of the uint's storage in bits + int const kPartSize = sizeof(uint64_t) * 8; + + // Use a union to store either low and high parts or, if present, a built-in 128b integer type. + union { + + struct { + uint64_t lo; + uint64_t hi; + }; + + #if defined(CUTLASS_UINT128_NATIVE) + unsigned __int128 native; + #endif // defined(CUTLASS_UINT128_NATIVE) + }; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + uint128_t(): lo(0), hi(0) { } + + /// Constructor from uint64 + CUTLASS_HOST_DEVICE + uint128_t(uint64_t lo_): lo(lo_), hi(0) { } + + /// Constructor from two 64b unsigned integers + CUTLASS_HOST_DEVICE + uint128_t(uint64_t lo_, uint64_t hi_): lo(lo_), hi(hi_) { + + } + + /// Optional constructor from native value + #if defined(CUTLASS_UINT128_NATIVE) + uint128_t(unsigned __int128 value): native(value) { } + #endif + + /// Lossily cast to uint64 + CUTLASS_HOST_DEVICE + explicit operator uint64_t() const { + return lo; + } + + CUTLASS_HOST_DEVICE + static void exception() { +#if defined(__CUDA_ARCH__) + asm volatile (" brkpt;\n"); +#else + throw std::runtime_error("Not yet implemented."); +#endif + } + + /// Add + CUTLASS_HOST_DEVICE + uint128_t operator+(uint128_t const &rhs) const { + uint128_t y; +#if defined(CUTLASS_UINT128_NATIVE) + y.native = native + rhs.native; +#else + y.lo = lo + rhs.lo; + y.hi = hi + rhs.hi + (!y.lo && (rhs.lo)); +#endif + return y; + } + + /// Subtract + CUTLASS_HOST_DEVICE + uint128_t operator-(uint128_t const &rhs) const { + uint128_t y; +#if defined(CUTLASS_UINT128_NATIVE) + y.native = native - rhs.native; +#else + y.lo = lo - rhs.lo; + y.hi = hi - rhs.hi - (rhs.lo && y.lo > lo); +#endif + return y; + } + + /// Multiply by unsigned 64b integer yielding 128b integer + CUTLASS_HOST_DEVICE + uint128_t operator*(uint64_t const &rhs) const { + uint128_t y; +#if defined(CUTLASS_UINT128_NATIVE) + y.native = native * rhs; +#elif defined(CUTLASS_INT128_ARITHMETIC) + // Multiply by the low part + y.lo = _umul128(lo, rhs, &y.hi); + + // Add the high part and ignore the overflow + uint64_t overflow; + y.hi += _umul128(hi, rhs, &overflow); +#else + // TODO - not implemented + exception(); +#endif + return y; + } + + /// Divide 128b operation by 64b operation yielding a 64b quotient + CUTLASS_HOST_DEVICE + uint64_t operator/(uint64_t const &divisor) const { + uint64_t quotient = 0; +#if defined(CUTLASS_UINT128_NATIVE) + quotient = uint64_t(native / divisor); +#elif defined(CUTLASS_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + uint64_t remainder = 0; + quotient = _udiv128(hi, lo, divisor, &remainder); +#else + // TODO - not implemented + exception(); +#endif + return quotient; + } + + /// Divide 128b operation by 64b operation yielding a 64b quotient + 
CUTLASS_HOST_DEVICE + uint64_t operator%(uint64_t const &divisor) const { + uint64_t remainder = 0; +#if defined(CUTLASS_UINT128_NATIVE) + remainder = uint64_t(native % divisor); +#elif defined(CUTLASS_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + (void)_udiv128(hi, lo, divisor, &remainder); +#else + // TODO - not implemented + exception(); +#endif + return remainder; + } + + /// Computes the quotient and remainder in a single method. + CUTLASS_HOST_DEVICE + uint64_t divmod(uint64_t &remainder, uint64_t divisor) const { + uint64_t quotient = 0; +#if defined(CUTLASS_UINT128_NATIVE) + quotient = uint64_t(native / divisor); + remainder = uint64_t(native % divisor); +#elif defined(CUTLASS_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + quotient = _udiv128(hi, lo, divisor, &remainder); +#else + // TODO - not implemented + exception(); +#endif + return quotient; + } + + /// Left-shifts a 128b unsigned integer + CUTLASS_HOST_DEVICE + uint128_t operator<<(int sh) const { + if (sh == 0) { + return *this; + } + else if (sh >= kPartSize) { + return uint128_t(0, lo << (sh - kPartSize)); + } + else { + return uint128_t( + (lo << sh), + (hi << sh) | uint64_t(lo >> (kPartSize - sh)) + ); + } + } + + /// Right-shifts a 128b unsigned integer + CUTLASS_HOST_DEVICE + uint128_t operator>>(int sh) const { + if (sh == 0) { + return *this; + } + else if (sh >= kPartSize) { + return uint128_t((hi >> (sh - kPartSize)), 0); + } + else { + return uint128_t( + (lo >> sh) | (hi << (kPartSize - sh)), + (hi >> sh) + ); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/wmma_array.h b/include/cutlass/wmma_array.h index e80961394d..37e87430a1 100644 --- a/include/cutlass/wmma_array.h +++ b/include/cutlass/wmma_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/media/docs/code_organization.md b/media/docs/code_organization.md index 9a00d3056f..55984d9bb1 100644 --- a/media/docs/code_organization.md +++ b/media/docs/code_organization.md @@ -220,7 +220,7 @@ of tests run may vary over time as more are added. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/doxygen_mainpage.md b/media/docs/doxygen_mainpage.md index 15656d25e5..6ff2575b9a 100644 --- a/media/docs/doxygen_mainpage.md +++ b/media/docs/doxygen_mainpage.md @@ -120,7 +120,7 @@ cudaError_t cutlass_sgemm_nn( # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
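The new `uint128_t` in `uint128.h` above keeps the value as `lo`/`hi` 64-bit halves, with a native `unsigned __int128` fast path when available. A standalone sketch of the portable half-word arithmetic (hypothetical `U128` type, not the CUTLASS struct), using the usual `sum < operand` carry test and the same three shift cases:

```cpp
#include <cassert>
#include <cstdint>

// 128-bit value split into two 64-bit halves.
struct U128 { uint64_t lo, hi; };

// Addition: detect carry out of the low half with the standard wraparound test.
U128 add(U128 a, U128 b) {
  U128 y;
  y.lo = a.lo + b.lo;
  y.hi = a.hi + b.hi + (y.lo < a.lo ? 1u : 0u);
  return y;
}

// Left shift by 0..127 bits, matching the three cases in operator<< above.
U128 shl(U128 a, int sh) {
  if (sh == 0) return a;
  if (sh >= 64) return U128{0, a.lo << (sh - 64)};
  return U128{a.lo << sh, (a.hi << sh) | (a.lo >> (64 - sh))};
}

int main() {
  U128 a{~uint64_t(0), 0};            // 2^64 - 1
  U128 b{1, 0};
  U128 s = add(a, b);                 // 2^64
  assert(s.lo == 0 && s.hi == 1);

  U128 t = shl(U128{1, 0}, 100);      // 1 << 100
  assert(t.lo == 0 && t.hi == (uint64_t(1) << 36));
  return 0;
}
```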
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/efficient_gemm.md b/media/docs/efficient_gemm.md index 7a1a6ae7f4..a8374fd8a2 100644 --- a/media/docs/efficient_gemm.md +++ b/media/docs/efficient_gemm.md @@ -225,7 +225,7 @@ targeting NVIDIA GPUs. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/functionality.md b/media/docs/functionality.md index aeb9bcf3b6..c5570750e3 100644 --- a/media/docs/functionality.md +++ b/media/docs/functionality.md @@ -249,7 +249,7 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++ # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/fundamental_types.md b/media/docs/fundamental_types.md index 7556cd45dc..40f5f0810f 100644 --- a/media/docs/fundamental_types.md +++ b/media/docs/fundamental_types.md @@ -346,7 +346,7 @@ support on current and future NVIDIA GPUs. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/gemm_api.md b/media/docs/gemm_api.md index fec32a0451..2c268fdc7b 100644 --- a/media/docs/gemm_api.md +++ b/media/docs/gemm_api.md @@ -541,7 +541,7 @@ to inline PTX. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/implicit_gemm_convolution.md b/media/docs/implicit_gemm_convolution.md index 5cc0a258e5..c564eb61e4 100644 --- a/media/docs/implicit_gemm_convolution.md +++ b/media/docs/implicit_gemm_convolution.md @@ -754,7 +754,7 @@ Convolution can also be run by the CUTLASS Profiler. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/layout.md b/media/docs/layout.md index bacec0e442..0de2751211 100644 --- a/media/docs/layout.md +++ b/media/docs/layout.md @@ -267,7 +267,7 @@ Permuted Shared Memory Layouts: # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/profiler.md b/media/docs/profiler.md index c7ce91a7ca..4d04f40b42 100644 --- a/media/docs/profiler.md +++ b/media/docs/profiler.md @@ -501,7 +501,7 @@ reference_device: Passed # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md index 0cf7ea257f..e87d93ba66 100644 --- a/media/docs/programming_guidelines.md +++ b/media/docs/programming_guidelines.md @@ -292,7 +292,7 @@ Github's pretty printer. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md index f283da8a3b..333ea07346 100644 --- a/media/docs/quickstart.md +++ b/media/docs/quickstart.md @@ -537,7 +537,7 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s* # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/terminology.md b/media/docs/terminology.md index 07464143cb..e41a655569 100644 --- a/media/docs/terminology.md +++ b/media/docs/terminology.md @@ -74,7 +74,7 @@ contiguous and strided dimensions of a tile. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/tile_iterator_concept.md b/media/docs/tile_iterator_concept.md index 061ff90734..c4a3962b54 100644 --- a/media/docs/tile_iterator_concept.md +++ b/media/docs/tile_iterator_concept.md @@ -466,7 +466,7 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept { # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/utilities.md b/media/docs/utilities.md index b9ddc79a70..fc4ac8ca1b 100644 --- a/media/docs/utilities.md +++ b/media/docs/utilities.md @@ -379,7 +379,7 @@ int main() { # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/images/13_example_block_resident_fusion.png b/media/images/13_example_block_resident_fusion.png new file mode 100755 index 0000000000..736857b946 Binary files /dev/null and b/media/images/13_example_block_resident_fusion.png differ diff --git a/media/images/13_example_fusion.png b/media/images/13_example_fusion.png new file mode 100755 index 0000000000..142c8d04a7 Binary files /dev/null and b/media/images/13_example_fusion.png differ diff --git a/media/images/13_example_rf_resident_fusion.png b/media/images/13_example_rf_resident_fusion.png new file mode 100755 index 0000000000..dc2786f756 Binary files /dev/null and b/media/images/13_example_rf_resident_fusion.png differ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 436990fd66..55542b5367 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. 
All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index d57570ce6c..7954c53e3e 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/common/cutlass_unit_test.h b/test/unit/common/cutlass_unit_test.h index 81908265fa..83e930da82 100644 --- a/test/unit/common/cutlass_unit_test.h +++ b/test/unit/common/cutlass_unit_test.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -59,3 +59,7 @@ void FilterArchitecture(); #define CUTLASS_TEST_L1(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(1,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__) #define CUTLASS_TEST_L2(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(2,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__) #endif + +#if !defined(CUTLASS_TEST_UNIT_ENABLE_WARNINGS) +#define CUTLASS_TEST_UNIT_ENABLE_WARNINGS false +#endif diff --git a/test/unit/common/filter_architecture.cpp b/test/unit/common/filter_architecture.cpp index 0c548bdf86..5e13354b6e 100644 --- a/test/unit/common/filter_architecture.cpp +++ b/test/unit/common/filter_architecture.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/CMakeLists.txt b/test/unit/conv/CMakeLists.txt index a50a58f59e..c2840838f2 100644 --- a/test/unit/conv/CMakeLists.txt +++ b/test/unit/conv/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/conv/device/CMakeLists.txt b/test/unit/conv/device/CMakeLists.txt index ce907e0d58..1578625686 100644 --- a/test/unit/conv/device/CMakeLists.txt +++ b/test/unit/conv/device/CMakeLists.txt @@ -20,32 +20,73 @@ # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- add_custom_target( +list(SORT CUTLASS_NVCC_ARCHS_ENABLED) +set(CUTLASS_NVCC_ARCHS_ENABLED_REVERSED ${CUTLASS_NVCC_ARCHS_ENABLED}) +list(REVERSE CUTLASS_NVCC_ARCHS_ENABLED_REVERSED) +list(GET CUTLASS_NVCC_ARCHS_ENABLED_REVERSED 0 CUTLASS_NVCC_MAX_ARCH) + +add_custom_target( cutlass_test_unit_conv_device DEPENDS cutlass_test_unit_conv_device_simt - cutlass_test_unit_conv_device_tensorop_f32_sm70 - cutlass_test_unit_conv_device_tensorop_f32_sm75 - cutlass_test_unit_conv_device_tensorop_f16_sm80 - cutlass_test_unit_conv_device_tensorop_f32_sm80 - cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 - cutlass_test_unit_conv_device_tensorop_s32 - cutlass_test_unit_conv_device_tensorop_s32_interleaved ) add_custom_target( test_unit_conv_device DEPENDS test_unit_conv_device_simt - test_unit_conv_device_tensorop_f32_sm70 - test_unit_conv_device_tensorop_f32_sm75 - test_unit_conv_device_tensorop_f16_sm80 - test_unit_conv_device_tensorop_f32_sm80 - test_unit_conv_device_tensorop_f32_tf32_sm80 - test_unit_conv_device_tensorop_s32 - test_unit_conv_device_tensorop_s32_interleaved ) +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 70) + + add_dependencies( + cutlass_test_unit_conv_device + cutlass_test_unit_conv_device_tensorop_f32_sm70 + ) + + add_dependencies( + test_unit_conv_device + test_unit_conv_device_tensorop_f32_sm70 + ) + +endif() + +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 75) + + add_dependencies( + cutlass_test_unit_conv_device + cutlass_test_unit_conv_device_tensorop_f32_sm75 + cutlass_test_unit_conv_device_tensorop_s32 + cutlass_test_unit_conv_device_tensorop_s32_interleaved + ) + + add_dependencies( + test_unit_conv_device + test_unit_conv_device_tensorop_f32_sm75 + test_unit_conv_device_tensorop_s32 + test_unit_conv_device_tensorop_s32_interleaved + ) + +endif() + +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80) + + add_dependencies( + cutlass_test_unit_conv_device + cutlass_test_unit_conv_device_tensorop_f16_sm80 + cutlass_test_unit_conv_device_tensorop_f32_sm80 + cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 + ) + + add_dependencies( + test_unit_conv_device + test_unit_conv_device_tensorop_f16_sm80 + test_unit_conv_device_tensorop_f32_sm80 + test_unit_conv_device_tensorop_f32_tf32_sm80 + ) + +endif() + # # OpClassSimt (CUDA cores) # @@ -56,20 +97,27 @@ cutlass_test_unit_add_executable( # F32 conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu - conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu - conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu - conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu - # CF32 conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu - - conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu - conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu - conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu ) +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80) + + cutlass_target_sources( + cutlass_test_unit_conv_device_simt + PRIVATE + conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + 
conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + ) + +endif() + # # OpClassTensorOp (Tensor cores) # @@ -92,57 +140,81 @@ cutlass_test_unit_add_executable( conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu ) -# Conv2d - F16 input, F16 output, F16 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_f16_sm80 - - conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu - conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu - conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu -) - -# Conv2d - F16 input, F32 output, F32 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_f32_sm80 - - - conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu - conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu - conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu - - conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu - conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu -) - -# Conv2d - TF32 input, F32 output, F32 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 - - conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu - conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu - conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80) + + # Conv2d - F16 input, F16 output, F16 accumulation + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f16_sm80 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + ) - conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu - conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu - conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu -) + # Conv2d - F16 input, F32 output, F32 accumulation + + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_sm80 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + + conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu + conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + ) + + # Conv2d - TF32 input, F32 output, F32 accumulation + + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 + + conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + + conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + ) -# Conv2d - S8 input, S32 output, S32 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_s32 +endif() - 
conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu - conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu - conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu - conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu -) +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 75) -# Conv2d - S8 interleaved input, S8 interleaved output, S32 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_s32_interleaved + # Conv2d - S8 input, S32 output, S32 accumulation - conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu - conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu - conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu - conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu -) + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_s32 + conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu + ) + + # Conv2d - S8 interleaved input, S8 interleaved output, S32 accumulation + + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_s32_interleaved + conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu + ) + + if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80) + + cutlass_target_sources( + cutlass_test_unit_conv_device_tensorop_s32 + PRIVATE + conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu + conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu + ) + + # Conv2d - S8 interleaved input, S8 interleaved output, S32 accumulation + cutlass_target_sources( + cutlass_test_unit_conv_device_tensorop_s32_interleaved + PRIVATE + conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu + conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu + ) + + endif() + +endif() diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu index 4d500d9783..ba53d6f727 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu index cc36edc75e..dc3f9d5062 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
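The test/unit/conv/device/CMakeLists.txt hunks above replace the flat dependency list with architecture-gated wiring: the highest enabled compute capability is derived from CUTLASS_NVCC_ARCHS_ENABLED, and each SM70/SM75/SM80 test target is only created and attached to the umbrella targets when the build actually enables that architecture. The sketch below condenses that pattern using a hypothetical target and source name; it is illustrative only.

```cmake
# Derive the highest enabled architecture (lexicographic sort is sufficient for
# the two-digit SM values used here).
list(SORT CUTLASS_NVCC_ARCHS_ENABLED)
set(CUTLASS_NVCC_ARCHS_ENABLED_REVERSED ${CUTLASS_NVCC_ARCHS_ENABLED})
list(REVERSE CUTLASS_NVCC_ARCHS_ENABLED_REVERSED)
list(GET CUTLASS_NVCC_ARCHS_ENABLED_REVERSED 0 CUTLASS_NVCC_MAX_ARCH)

# Only build and wire in SM80-specific tests when SM80 is actually enabled.
if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80)
  cutlass_test_unit_add_executable(
    cutlass_test_unit_conv_device_example_sm80   # hypothetical target name
    example_conv_kernel_sm80.cu                  # hypothetical source file
  )
  add_dependencies(
    cutlass_test_unit_conv_device
    cutlass_test_unit_conv_device_example_sm80
  )
endif()
```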
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu index aab0d34e49..e3eb0736d8 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu index bc9ee6e9d7..ff512c02b2 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu index 7417f92197..212290cb8b 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu index 01f51a2cc4..b1fc52f4d6 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu index 7682a319fe..542e1e6b96 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu index 48c6ddb043..262e221e9f 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu index b3b66a9de1..9ef2c7f640 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu index 25e3ee0d5f..baece322a8 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu index e151f5a78f..3366f1b541 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu index 4c8102a503..72026a7e8c 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu index 15f5585839..5332bea683 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu index b54359f177..7b74e1284b 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu index 51d2b942f4..46c366b76a 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu index 820f0fb89f..78885e2a4e 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu index 746e7d7b0b..3637fe8c9f 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu index 7255eac644..b9fa943f07 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu index 7e9bb9060b..a343aed586 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu index 5426003779..38f6c6fbe7 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu index d0ba7a5047..0bdd99bd0c 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu index fbab373165..5893564830 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu index e8b7c44fe2..021dc9a39f 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu index e5146be328..43f6c0965b 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu index 4cfdd3722d..2446c4aa99 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu index c1a1f647a3..4c7b3d77df 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_problems.h b/test/unit/conv/device/conv2d_problems.h index 74b43e11c7..c532894e9b 100644 --- a/test/unit/conv/device/conv2d_problems.h +++ b/test/unit/conv/device/conv2d_problems.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -165,6 +165,22 @@ struct TestbedConv2dProblemSizes { // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64} //////////////////////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 1, 1, minimum_channel_size}, // input size (NHWC) + {8, 1, 1, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 1, 8, minimum_channel_size}, // input size (NHWC) + {8, 1, 3, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( {1, 8, 8, minimum_channel_size}, // input size (NHWC) {8, 3, 3, minimum_channel_size}, // filter size (KRSC) @@ -322,7 +338,7 @@ struct TestbedConv2dProblemSizes { {1, 1}, // dilation (dilation_h, dilation_w) {4, 1, 1, 328} // output size (NPQK) )); - + } diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h index 14bdd9bf13..9b94a4db61 100644 --- a/test/unit/conv/device/conv2d_testbed.h +++ b/test/unit/conv/device/conv2d_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -204,10 +204,13 @@ class TestbedConv2d { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive test if CUDA device is insufficient - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } #if 0 //display conv2d problem size for debugging std::cout << problem_size << std::endl diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h index cb4ecc7056..06ab207d14 100644 --- a/test/unit/conv/device/conv2d_testbed_interleaved.h +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
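The conv2d_problems.h hunk above adds two very small problem sizes (1x1 and 1x8 spatial extents) to stress the C < CTA::K paths, and conv2d_testbed.h now reports waived tests only when CUTLASS_TEST_UNIT_ENABLE_WARNINGS is set. As a standalone illustration of the constructor-argument order those new entries rely on, the sketch below builds one such problem size; the header path is assumed, `minimum_channel_size` is the testbed's existing knob, and the value 32 is only illustrative.

```cpp
#include <vector>

#include "cutlass/conv/conv2d_problem_size.h"   // assumed header for Conv2dProblemSize

void append_small_channel_problem(std::vector<cutlass::conv::Conv2dProblemSize> &sizes) {
  int const minimum_channel_size = 32;   // illustrative; the testbed supplies its own value

  sizes.push_back(cutlass::conv::Conv2dProblemSize(
    {1, 1, 1, minimum_channel_size},   // input size  (NHWC)
    {8, 1, 1, minimum_channel_size},   // filter size (KRSC)
    {1, 1, 1, 1},                      // padding     (pad_h, _, pad_w, _)
    {1, 1},                            // stride      (stride_h, stride_w)
    {1, 1}                             // dilation    (dilation_h, dilation_w)
  ));
}
```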
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu index 07961dd2b7..dbc5533225 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu index a68a30fe5b..6cf9b15fb7 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu index 3cbde02888..a27265143d 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu index ffb79d77ad..bd49794eab 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu index 1101090a12..14b93a3621 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu index ade6f8df32..ca74be4d7e 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu index a0aac81147..5645c90de7 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu index 2185257f15..d67d54290c 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..e57d61170c --- /dev/null +++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +TEST(SM80_Device_Conv3d_Dgrad_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// + + +TEST(SM80_Device_Conv3d_Dgrad_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED + diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu index 211a331d8b..dfb64ce363 100644 --- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -1,5 
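The new conv3d dgrad test file above instantiates DefaultConv3dDgrad for SM80 in both the Analytic and Optimized iterator-algorithm variants. The sketch below restates that instantiation pattern with the template arguments written out in full (the epilogue vector width parameterized on ElementC, and ImplicitGemmConvolution and TestAllConv3d parameterized on the kernel and device types); treat those exact arguments as an editorial reconstruction, not a quotation of the committed code.

```cpp
#include "../../common/cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv3d_dgrad.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "conv3d_testbed.h"

#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad<
    ElementA, cutlass::layout::TensorNDHWC,
    ElementB, cutlass::layout::TensorNDHWC,
    ElementC, cutlass::layout::TensorNDHWC,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,            // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,              // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,               // tensor core instruction shape
    cutlass::epilogue::thread::LinearCombination<
        ElementC,
        128 / cutlass::sizeof_bits<ElementC>::value,   // vector width of epilogue accesses
        ElementAccumulator,
        ElementCompute>,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    4,                                                 // pipeline stages
    cutlass::arch::OpMultiplyAdd>::Kernel;

using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv3dDgradKernel>;

TEST(SM80_Device_Conv3d_Dgrad_Example_Reconstruction, 128x128_32x4_64x64x32) {
  // Run the testbed's full set of 3-D problem sizes against this instance.
  EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dDgrad>());
}

#endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
```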
+1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -76,5 +76,46 @@ TEST(SM80_Device_Conv3d_Dgrad_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc EXPECT_TRUE(test::conv::device::TestAllConv3d()); } +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Dgrad_Optimized_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} //////////////////////////////////////////////////////////////////////////////// #endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu new file mode 100644 index 0000000000..d5abb46e72 --- /dev/null +++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu @@ -0,0 +1,80 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv3d_Fprop_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv3d instance + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..b89485e2cb --- /dev/null +++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,159 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +TEST(SM80_Device_Conv3d_Fprop_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// + + +TEST(SM80_Device_Conv3d_Fprop_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, +
cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Fprop_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 64x256_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu index 0aabef5ba6..9a5c21eafc 100644 --- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
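The new test files above only drive these operators through TestAllConv3d(), which hides the host-side setup. For reference, the following is a minimal sketch (not part of the patch) of how the SM80 f16 NDHWC Fprop operator defined above could be instantiated and run on a single problem size. The kernel typedef and epilogue mirror the test; the Conv3dProblemSize constructor mirrors conv3d_problems.h; the extent helpers (implicit_gemm_tensor_*_extent), the can_implement/initialize/operator() sequence, and the assumption that no workspace is needed for a single split-K slice follow the device-level ImplicitGemmConvolution interface as the conv3d testbed uses it, and should be read as assumptions rather than as part of this change.

```
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv3d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/conv/conv3d_problem_size.h"
#include "cutlass/util/host_tensor.h"

// Same configuration as the SM80 f16 NDHWC Fprop test above.
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop<
  ElementA, cutlass::layout::TensorNDHWC,
  ElementB, cutlass::layout::TensorNDHWC,
  ElementC, cutlass::layout::TensorNDHWC,
  ElementAccumulator,
  cutlass::arch::OpClassTensorOp,
  cutlass::arch::Sm80,
  cutlass::gemm::GemmShape<128, 128, 32>,
  cutlass::gemm::GemmShape<64, 64, 32>,
  cutlass::gemm::GemmShape<16, 8, 16>,
  cutlass::epilogue::thread::LinearCombination<
    ElementC,
    128 / cutlass::sizeof_bits<ElementC>::value,
    ElementAccumulator,
    ElementCompute
  >,
  cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
  4,
  cutlass::arch::OpMultiplyAdd
>::Kernel;

using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>;

cutlass::Status run_single_fprop() {

  // One NDHWC problem, mirroring the sizes added to conv3d_problems.h:
  // input (N, D, H, W, C), filter (K, T, R, S, C), unit padding/stride/dilation.
  cutlass::conv::Conv3dProblemSize problem_size(
    {1, 8, 8, 8, 64},               // input size  (NDHWC)
    {32, 3, 3, 3, 64},              // filter size (KTRSC)
    cutlass::Coord<3>({1, 1, 1}),   // padding
    cutlass::Coord<3>({1, 1, 1}),   // stride
    cutlass::Coord<3>({1, 1, 1}));  // dilation

  // Tensors sized by the extent helpers the conv3d testbed uses (assumed names).
  cutlass::HostTensor<ElementA, cutlass::layout::TensorNDHWC> tensor_a(
    cutlass::conv::implicit_gemm_tensor_a_extent(cutlass::conv::Operator::kFprop, problem_size));
  cutlass::HostTensor<ElementB, cutlass::layout::TensorNDHWC> tensor_b(
    cutlass::conv::implicit_gemm_tensor_b_extent(cutlass::conv::Operator::kFprop, problem_size));
  cutlass::HostTensor<ElementC, cutlass::layout::TensorNDHWC> tensor_c(
    cutlass::conv::implicit_gemm_tensor_c_extent(cutlass::conv::Operator::kFprop, problem_size));
  cutlass::HostTensor<ElementC, cutlass::layout::TensorNDHWC> tensor_d(
    cutlass::conv::implicit_gemm_tensor_c_extent(cutlass::conv::Operator::kFprop, problem_size));

  // D = alpha * conv3d(A, B) + beta * C
  typename Conv3dFprop::Arguments arguments{
    problem_size,
    tensor_a.device_ref(),
    tensor_b.device_ref(),
    tensor_c.device_ref(),
    tensor_d.device_ref(),
    {ElementCompute(1), ElementCompute(0)}
  };

  Conv3dFprop conv_op;

  cutlass::Status status = conv_op.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // Assumes no workspace is required for the default serial split-K with one slice.
  status = conv_op.initialize(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  return conv_op();   // launches the implicit GEMM convolution kernel
}
```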
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -76,5 +76,46 @@ TEST(SM80_Device_Conv3d_Fprop_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc EXPECT_TRUE(test::conv::device::TestAllConv3d()); } +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Fprop_Optimized_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv3d instance + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + //////////////////////////////////////////////////////////////////////////////// #endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_problems.h b/test/unit/conv/device/conv3d_problems.h index 9cc618467e..21dc4b4f07 100644 --- a/test/unit/conv/device/conv3d_problems.h +++ b/test/unit/conv/device/conv3d_problems.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -107,9 +107,25 @@ struct TestbedConv3dProblemSizes { )); conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( - {1, 1, 16, 16, minimum_channel_size}, // input size (NDHWC) - {8, 1, 3, 3, minimum_channel_size}, // filter size (KTRSC) - cutlass::Coord<3>({0, 1, 1}), // padding (pad_d, pad_h, pad_w) + {1, 1, 1, 8, minimum_channel_size}, // input size (NDHWC) + {8, 1, 1, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 8, 8, 8, minimum_channel_size}, // input size (NDHWC) + {8, 3, 3, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 16, 16, 16, minimum_channel_size}, // input size (NDHWC) + {8, 3, 3, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) )); @@ -138,6 +154,7 @@ struct TestbedConv3dProblemSizes { cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) )); + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( {1, 11, 15, 19, 64}, // input size (NDHWC) {32, 4, 3, 6, 64}, // filter size (KTRSC) diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h index 179520d158..87ac39abb7 100644 --- a/test/unit/conv/device/conv3d_testbed.h +++ b/test/unit/conv/device/conv3d_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -204,10 +204,14 @@ class TestbedConv3d { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute()) { - // Waive test if CUDA device is insufficient. - if (!sufficient()) { - return true; - } + + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." 
<< std::endl; + return true; + } #if 0 //display conv3d problem size for debugging std::cout << problem_size << std::endl @@ -413,11 +417,6 @@ bool TestAllConv3d( // TestbedConv3dProblemSizes conv3d_problems(128/cutlass::sizeof_bits::value); - // - // Get conv problem sizes to run conv operator - // - //TestbedConv3dProblemSizes conv_problems(128/cutlass::sizeof_bits::value); - // Vector of conv3d problem sizes to avoid duplicate runs Conv3dProblemVector conv_tested_sizes; @@ -443,12 +442,17 @@ // Procedurally disable certain cases // - // CUTLASS DGRAD's unity stride specialization only support stride {1, 1} if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad) && - (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == - cutlass::conv::StrideSupport::kUnity)) { - if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) { + // CUTLASS DGRAD's unity stride specialization only supports stride {1, 1, 1} if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad) && + ((ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == + cutlass::conv::StrideSupport::kUnity) || + (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorB::kStrideSupport == + cutlass::conv::StrideSupport::kUnity))) { + if (!((conv_problem.stride_d == 1) && + (conv_problem.stride_h == 1) && + (conv_problem.stride_w == 1)) + ) { continue; } } diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu index a3f8409447..e706f1dae7 100644 --- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu index 9847aede81..89167ce384 100644 --- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
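The stride guard added to conv3d_testbed.h above can be read in isolation: dgrad kernels whose A or B tile iterator is specialized for unit stride are only handed problems with stride {1, 1, 1}. Below is a small sketch of that predicate as a standalone helper, assuming only the traits and fields that appear in the diff (kConvolutionalOperator, Mma::IteratorA/B::kStrideSupport, and the Conv3dProblemSize stride members); the helper name and header choices are illustrative.

```
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/conv3d_problem_size.h"

// Mirrors the procedural disable in conv3d_testbed.h: unity-stride DGRAD
// specializations are skipped unless the problem uses stride {1, 1, 1}.
template <typename ImplicitGemm>
bool conv3d_stride_supported(cutlass::conv::Conv3dProblemSize const &problem) {

  bool unity_stride_only =
    (ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad) &&
    ((ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
        cutlass::conv::StrideSupport::kUnity) ||
     (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorB::kStrideSupport ==
        cutlass::conv::StrideSupport::kUnity));

  if (unity_stride_only) {
    return (problem.stride_d == 1) && (problem.stride_h == 1) && (problem.stride_w == 1);
  }

  return true;
}
```

A caller looping over TestbedConv3dProblemSizes could use such a predicate to skip, rather than fail, problems that the unity-stride dgrad specializations cannot run, which is exactly what the testbed's `continue` does.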
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu index 6dcbf0e726..477762fd1d 100644 --- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/CMakeLists.txt b/test/unit/core/CMakeLists.txt index 19282035f5..ade17ae0bd 100644 --- a/test/unit/core/CMakeLists.txt +++ b/test/unit/core/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/core/array.cu b/test/unit/core/array.cu index 5a8cc855b0..bafbfbc298 100644 --- a/test/unit/core/array.cu +++ b/test/unit/core/array.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/bfloat16.cu b/test/unit/core/bfloat16.cu index d33ff2cc3c..29262fadc6 100644 --- a/test/unit/core/bfloat16.cu +++ b/test/unit/core/bfloat16.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/complex.cu b/test/unit/core/complex.cu index 003762f719..59812d6e38 100644 --- a/test/unit/core/complex.cu +++ b/test/unit/core/complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/functional.cu b/test/unit/core/functional.cu index ab843154ef..a3b98f7037 100644 --- a/test/unit/core/functional.cu +++ b/test/unit/core/functional.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/half.cu b/test/unit/core/half.cu index dad1f97a79..a888741f5e 100644 --- a/test/unit/core/half.cu +++ b/test/unit/core/half.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/matrix.cu b/test/unit/core/matrix.cu index f012fe9f87..f94605d7b5 100644 --- a/test/unit/core/matrix.cu +++ b/test/unit/core/matrix.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/matrix_coord.cu b/test/unit/core/matrix_coord.cu index 841d4cb72a..69d4f0977e 100644 --- a/test/unit/core/matrix_coord.cu +++ b/test/unit/core/matrix_coord.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/numeric_conversion.cu b/test/unit/core/numeric_conversion.cu index 5f8f383987..8fc3128d87 100644 --- a/test/unit/core/numeric_conversion.cu +++ b/test/unit/core/numeric_conversion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/predicate_vector.cu b/test/unit/core/predicate_vector.cu index f9a0675c01..3dbe835242 100644 --- a/test/unit/core/predicate_vector.cu +++ b/test/unit/core/predicate_vector.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/quaternion.cu b/test/unit/core/quaternion.cu index 69ce928aec..62f8118834 100644 --- a/test/unit/core/quaternion.cu +++ b/test/unit/core/quaternion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tensor_ref.cu b/test/unit/core/tensor_ref.cu index 6bedddc577..f30cc19476 100644 --- a/test/unit/core/tensor_ref.cu +++ b/test/unit/core/tensor_ref.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tensor_view.cu b/test/unit/core/tensor_view.cu index 684ca5b0f2..6ea8d2f313 100644 --- a/test/unit/core/tensor_view.cu +++ b/test/unit/core/tensor_view.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/test_unit_core.cpp b/test/unit/core/test_unit_core.cpp index a6dfbf4bbc..bcebec675f 100644 --- a/test/unit/core/test_unit_core.cpp +++ b/test/unit/core/test_unit_core.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tfloat32.cu b/test/unit/core/tfloat32.cu index 9b54603fee..96e4c91389 100644 --- a/test/unit/core/tfloat32.cu +++ b/test/unit/core/tfloat32.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/CMakeLists.txt b/test/unit/epilogue/CMakeLists.txt index 9de2d56edb..66050010d5 100755 --- a/test/unit/epilogue/CMakeLists.txt +++ b/test/unit/epilogue/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/CMakeLists.txt b/test/unit/epilogue/thread/CMakeLists.txt index 9b04f7752a..dd43262d1b 100644 --- a/test/unit/epilogue/thread/CMakeLists.txt +++ b/test/unit/epilogue/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/linear_combination.cu b/test/unit/epilogue/thread/linear_combination.cu index 6518e98738..5ff188a3e8 100644 --- a/test/unit/epilogue/thread/linear_combination.cu +++ b/test/unit/epilogue/thread/linear_combination.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/linear_combination_planar_complex.cu b/test/unit/epilogue/thread/linear_combination_planar_complex.cu index 89d1be5e02..e6327a1dee 100644 --- a/test/unit/epilogue/thread/linear_combination_planar_complex.cu +++ b/test/unit/epilogue/thread/linear_combination_planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/CMakeLists.txt b/test/unit/epilogue/threadblock/CMakeLists.txt index cb8b7a62d5..b987a05cb1 100755 --- a/test/unit/epilogue/threadblock/CMakeLists.txt +++ b/test/unit/epilogue/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -22,6 +22,7 @@ cutlass_test_unit_add_executable( cutlass_test_unit_epilogue_threadblock + predicated_tile_iterator.cu output_tile_threadmap.cu epilogue_simt.cu diff --git a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu index 76b70f5069..11fa80cf8a 100644 --- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu +++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt.cu b/test/unit/epilogue/threadblock/epilogue_simt.cu index 935a812426..72b86cfac1 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu index 25cd8933c5..c6ff649dc8 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu index f3552a1847..0cdadad0c1 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu index db8e68a3a5..d1a2b9d9a5 100644 --- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -460,7 +460,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_128x32_64x32x32) { } -TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_256x128_64x64x32) { +TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_256x128_64x64x32) { // // Define the warp-level matrix multiply @@ -520,7 +520,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_256x128_64x64x32) { } -TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_128x256_64x64x32) { +TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_128x256_64x64x32) { // // Define the warp-level matrix multiply diff --git a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu index 88fa98cf03..7fc4c7e437 100644 --- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu index 24752a1df0..3039d1fc1e 100644 --- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu +++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/output_tile_threadmap.cu b/test/unit/epilogue/threadblock/output_tile_threadmap.cu index 6e6e96e71f..19824e8ae9 100644 --- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu +++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu index 40874f7bf1..fddb0e17dc 100644 --- a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu +++ b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/testbed.h b/test/unit/epilogue/threadblock/testbed.h index 1dc9baa317..ba5241af3b 100644 --- a/test/unit/epilogue/threadblock/testbed.h +++ b/test/unit/epilogue/threadblock/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/testbed_planar_complex.h b/test/unit/epilogue/threadblock/testbed_planar_complex.h index 6afa603293..3c2959dbcc 100644 --- a/test/unit/epilogue/threadblock/testbed_planar_complex.h +++ b/test/unit/epilogue/threadblock/testbed_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/CMakeLists.txt b/test/unit/epilogue/warp/CMakeLists.txt index dbd7ee65b5..97b942d0f6 100644 --- a/test/unit/epilogue/warp/CMakeLists.txt +++ b/test/unit/epilogue/warp/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu index 9e94616f72..945732762e 100644 --- a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu index 3522c9e925..cf3ffe50cb 100644 --- a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu index 4931d93718..c6112a7e56 100644 --- a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/CMakeLists.txt b/test/unit/gemm/CMakeLists.txt index 4ac245716f..ff4280bf30 100644 --- a/test/unit/gemm/CMakeLists.txt +++ b/test/unit/gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 7ead7eba54..87e495987f 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -123,6 +123,8 @@ cutlass_test_unit_add_executable( BATCH_SOURCES ON BATCH_SIZE 4 + gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu + gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -172,7 +174,6 @@ cutlass_test_unit_add_executable( BATCH_SOURCES ON BATCH_SIZE 4 - gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu index fc887bce36..dbf2960dcd 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu index d8b9072736..5006bcb253 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu index 03f0b75251..0267531b8e 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu index 77777a66f3..b30226e83a 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu index f6862b0d2d..9de52a182c 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu index b4fb7eba02..079cf81224 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu index 3da9cdbb58..8c62c37888 100644 --- a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu index b0dbbdc856..2ee4e3f807 100644 --- a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu index b15af10764..11f3cc0543 100644 --- a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu index cec5ce60a5..0d9425347a 100644 --- a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu index c7df15d140..b71d02064d 100644 --- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu +++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu index 5113d2f800..76a7fe5002 100644 --- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu index 427c1e0e13..5922a4788e 100644 --- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu +++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu index 74fbc1f549..7796d5a80e 100644 --- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu index ea3da85d52..aca46c5f03 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu index 167949d8c6..c5ad79bab1 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu index ae72cade2f..4be57a632c 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu index 858fd301fe..abbd4f41dc 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu index 1f4d3e2933..efa04dba6e 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu index 2dc224ab2e..147a333b9c 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu index 71f21444cf..bc892a5ed9 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu index bb1665062e..28b75ec28c 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu index 3e8b96584f..ecd5a0463e 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu index cd6e48a3a2..19d2850f0e 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu index a9f9ea9978..a42c1827f0 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu index d797ed5577..326f11fd3b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu index 7cf1fad244..2d86068aae 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu index cef53a2dc9..fa82abacb2 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu index be764f5282..410abad050 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu index 25d3e5bee8..616cb998a6 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu index f7c8fb23f2..4208cd509b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu index 2798007695..4a75ec1601 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu index b4114ffe51..9314d2131b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu index 6ca8ada8a5..1b034b85ac 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu index 64b697af81..3baadeec11 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu index cff5070599..976b671cb0 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu index 849b7582e6..323a980891 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu index 8a760b02ab..90feba8b33 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu index 9f2c2c542d..b4e09b5aac 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu index aa92606167..2d77ed3ea9 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu index dac3675b84..ffd8e3f73b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu index 74434cc9fa..db25e6c7d2 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu index 176112d10f..4cdbdfe586 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu index 47e927d450..7b17e7f02e 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu index 8ae6464f27..da53084eaa 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu index de19ca0047..0472750584 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu index 0b83c6cbb7..e63e601fcd 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu index a81684241b..ab634268df 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu index 585b1df179..e2330ee64b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu index ab030e5a97..dcc5581d4e 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu index b8fa4dad8e..82f5869d94 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu index 358aacecd9..47bbf0cd8b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu index 957bcd2ab0..face7b9382 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu index 7c0f3b406a..015731c884 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu index 972756bba8..9a2a355a6b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu index ffba9c0dac..dfa302b613 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu index 14030b1d41..1429ed1a23 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu index 9a1918db44..5b104ee1fa 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu index 51a09194e4..e3a1074ff9 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu index 74d64af70d..0a232e0724 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu index d4bc720bca..d6fc20674b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu index dd0976d9f7..840e99de14 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu index 83c5cd1479..6fc1096ee1 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu index b62d99f78a..f028401cde 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu index 6d78dc9a9b..63368ed81b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu index 5ea2f9ce00..94c34bccce 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu index 0f773de4f2..caec177f96 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu index 54d6229a0d..fc0dbdf328 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu index d123931e1a..92a8d20366 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu index b1286accd1..7be0e08f05 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu index 5a511540fa..eb91e486c0 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu index 26f41ac2b7..47828166b8 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu index 06498afb9a..a7c64010d4 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu index e377980bbf..d005347b0f 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu index 96f5dcc947..26f1c79ccc 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu index 0497e61945..3cee859958 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu index 0f94d589c6..8c2951dad7 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu index 2163711b84..d6a481a8b3 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu index 91095a945d..e6fcabef5c 100644 --- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu index 2108eeb4e4..668a193e71 100644 --- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu index 869b59b51d..0cebe7a129 100644 --- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu index fda4371705..0efa0a9215 100644 --- a/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu index 7c2b6c6e38..ff2a5c752d 100644 --- a/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu index eec3ca4cdb..96a6320053 100644 --- a/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu index 64fe313c50..120cae0571 100644 --- a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu index 63c765c551..8f7425737e 100644 --- a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu index 99303712e5..f974cf16e1 100644 --- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu index 993b0b9d5a..f21995c597 100644 --- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu index 25fd50cfc3..f66ba86d74 100644 --- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu index 4cc4068170..6acd01359c 100644 --- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu index d53e3c0768..d0bfd412ae 100644 --- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -76,8 +76,7 @@ TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 64x128x128_32x64x128) {
     32,
     32,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   test::gemm::device::MultistageInterleavedTestbed testbed;
@@ -117,8 +116,7 @@ TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x128x128_64x64x128) {
     32,
     32,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   test::gemm::device::MultistageInterleavedTestbed testbed;
@@ -158,8 +156,7 @@ TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 256x128x128_64x64x128) {
     32,
     32,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   test::gemm::device::MultistageInterleavedTestbed testbed;
@@ -199,8 +196,7 @@ TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x256x128_64x64x128) {
     32,
     32,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   test::gemm::device::MultistageInterleavedTestbed testbed;
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
index 983dff337f..6f1eef5c96 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
index 8dd541838f..9bf9e45bc9 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
-  Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+  Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.

  Redistribution and use in source and binary forms, with or without modification, are permitted
  provided that the following conditions are met:
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
index 01a65b32a5..05827535d6 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu index 33f3b07a2a..76cfe97087 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu index 1a3f7dba85..3ec1553a4d 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu index aaf618267e..5597b61e37 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu index 857df472a7..4cbc34839e 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu index 51d182cd66..68f7c152fc 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu index 90fe6bcfd8..097ef5418a 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu index 393e68bfd6..0ca058ed38 100644 --- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu index c4900e489e..e5fae709e6 100644 --- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,8 +73,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x64x64_32x32x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -112,8 +111,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x64x64_64x32x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -151,8 +149,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x128x64_32x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -190,8 +187,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x128x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -229,8 +225,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x128x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -268,8 +263,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x256x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -307,8 +301,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x64x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -346,8 +339,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x256x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -358,4 +350,3 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x256x64_64x64x64) { //////////////////////////////////////////////////////////////////////////////// #endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) - diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu index 6ac9b71bf2..215e6a6a5e 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu index 9e1076a833..21ab4f746f 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu index cc6e4c3a5d..b139ec4117 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu index 86a678d22b..d0c3bcce80 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu index a86dc2442e..9399725d35 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu index 5b9b1d7d95..26db722085 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu index d53571a2d7..6fda75299b 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu index 024cba0a49..1358cec554 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu index 2d6db336f6..4d7e5b3a42 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu index ac5757e0ee..36c76e61aa 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu index 93642e64b6..2ee6365985 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu index 197e69b710..b5b99164f8 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu index 719e2ac760..9447d611e0 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu index e7a01bed61..e2057f5a13 100644 --- a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu index 39b5f10a70..dfbc64e2a8 100644 --- a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu +++ b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu index 42e991ed09..3f8cc5eeef 100644 --- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu +++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu index 3381f1703a..f4f4fc55db 100644 --- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu index 78c6e8657e..9bc2f10c9a 100644 --- a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu index 11af88897f..6ba1ddf371 100644 --- a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu index a28101f3d5..7eed9680ba 100644 --- a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu index a1a0fd7e31..8374c51889 100644 --- a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu index a63163680b..4dc26800d2 100644 --- a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu index e32441941d..891b5b578a 100644 --- a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu index 301cce7851..901102cd76 100644 --- a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu +++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu index df28110a33..aedfdde45f 100644 --- a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu similarity index 96% rename from test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu rename to test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu index e7b4405a08..ef66c6a226 100644 --- a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,7 +45,7 @@ //////////////////////////////////////////////////////////////////////////////// -#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) //////////////////////////////////////////////////////////////////////////////// @@ -105,7 +105,7 @@ TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32_u //////////////////////////////////////////////////////////////////////////////// -#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) //////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/multistage_testbed.h b/test/unit/gemm/device/multistage_testbed.h index f7b6ac8f56..6fb573b918 100644 --- a/test/unit/gemm/device/multistage_testbed.h +++ b/test/unit/gemm/device/multistage_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/multistage_testbed_interleaved.h b/test/unit/gemm/device/multistage_testbed_interleaved.h index c98264de01..1b12cd5b29 100644 --- a/test/unit/gemm/device/multistage_testbed_interleaved.h +++ b/test/unit/gemm/device/multistage_testbed_interleaved.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu index 5aabfca587..680012bcac 100644 --- a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu index c5265ce2b9..0f20a92f73 100644 --- a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu index 9db96c996a..a6072d2804 100644 --- a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu index 0ac7b4c9f8..8162905b31 100644 --- a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu index 1efa9d0446..af5dbb7cd5 100644 --- a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu index 886c0f9c74..d5cb5e7546 100644 --- a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu index a43d0afd5d..84cb465b20 100644 --- a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu index 0175978d00..e9633f5c4b 100644 --- a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu index a3aa5ce840..9cabed9069 100644 --- a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu index d5541939e9..83f5ceb12e 100644 --- a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu index 526bc01a4c..d7c67e2dee 100644 --- a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu index ad464b3018..cfd60a3b14 100644 --- a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_igemm_nn_sm50.cu b/test/unit/gemm/device/simt_igemm_nn_sm50.cu index 3db133ebfd..be25b52027 100644 --- a/test/unit/gemm/device/simt_igemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_igemm_nt_sm50.cu b/test/unit/gemm/device/simt_igemm_nt_sm50.cu index 01f56ea030..8a81a7b48c 100644 --- a/test/unit/gemm/device/simt_igemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_igemm_tn_sm50.cu b/test/unit/gemm/device/simt_igemm_tn_sm50.cu index 3692ec2c3b..2a871ecc5d 100644 --- a/test/unit/gemm/device/simt_igemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_igemm_tt_sm50.cu b/test/unit/gemm/device/simt_igemm_tt_sm50.cu index 2254669b36..f86e8e975e 100644 --- a/test/unit/gemm/device/simt_igemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61.cu b/test/unit/gemm/device/simt_int8_igemm_sm61.cu index 1364a38cff..ca5f3e7b6b 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,9 +72,7 @@ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, \ 2 \ >; \ - EXPECT_TRUE(test::gemm::device::TestAllGemm()); \ - - + EXPECT_TRUE(test::gemm::device::TestAllGemm()); //////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu index 4e4308ff37..cad5de367d 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu index 88c72aee4c..e7badc070a 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu index 0412d751c3..64e524b419 100644 --- a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu index 1adb9b5ae4..e520e29810 100644 --- a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu index f0fe1ebd94..3a1b5de6ea 100644 --- a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu +++ b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu index 0c00e56084..aa3a0d6eed 100644 --- a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu index c183fbff34..9ed5f1292c 100644 --- a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu +++ b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu index ce7ab9a7e0..c148c9564b 100644 --- a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sm50.py b/test/unit/gemm/device/simt_sm50.py index f53dae2715..525fa2a8c1 100644 --- a/test/unit/gemm/device/simt_sm50.py +++ b/test/unit/gemm/device/simt_sm50.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -123,7 +123,7 @@ # write file header out.write("/***************************************************************************************************\n" -" * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.\n" +" * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.\n" " *\n" " * Redistribution and use in source and binary forms, with or without modification, are permitted\n" " * provided that the following conditions are met:\n" diff --git a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu index 7731559a81..e325ced874 100644 --- a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu index 17ea98203a..2a309a4eca 100644 --- a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu index 175c312868..2a9f33d4bb 100644 --- a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu index 544e626c5a..013a1ba53e 100644 --- a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h index c2bf40ec21..24ec13e495 100644 --- a/test/unit/gemm/device/testbed.h +++ b/test/unit/gemm/device/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -284,10 +284,13 @@ struct Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive test if insufficient CUDA device - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } this->initialize(problem_size); diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h index a3e1353ee1..941fa93fba 100644 --- a/test/unit/gemm/device/testbed_complex.h +++ b/test/unit/gemm/device/testbed_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -101,6 +101,7 @@ struct TestbedComplex : public Testbed { return this->compare_reference(problem_size, alpha, beta); } + /// Returns true if the CUDA device is sufficient to execute the kernel. bool sufficient() const { // // Determine SMEM requirements and waive if not satisfied @@ -136,10 +137,13 @@ struct TestbedComplex : public Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive the test if device not sufficient - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." 
<< std::endl; + } + return true; + } // // Initialize workspace diff --git a/test/unit/gemm/device/testbed_interleaved.h b/test/unit/gemm/device/testbed_interleaved.h index 6e14f87f6e..3ea1d222b9 100644 --- a/test/unit/gemm/device/testbed_interleaved.h +++ b/test/unit/gemm/device/testbed_interleaved.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -137,9 +137,13 @@ struct InterleavedTestbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } // // Allocate the GEMM workspace diff --git a/test/unit/gemm/device/testbed_planar_complex.h b/test/unit/gemm/device/testbed_planar_complex.h index 0e4e561e42..3bc997757a 100644 --- a/test/unit/gemm/device/testbed_planar_complex.h +++ b/test/unit/gemm/device/testbed_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -112,10 +112,47 @@ class TestbedPlanarComplex { tensor_D.sync_device(); } + /// Returns true if the CUDA device is sufficient to execute the kernel. + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + bool run( cutlass::complex alpha = {1, 0}, cutlass::complex beta = {0, 0}) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } + initialize(); int batch_count = 1; diff --git a/test/unit/gemm/device/testbed_sanity.h b/test/unit/gemm/device/testbed_sanity.h index 025fb3874d..90f61590d5 100644 --- a/test/unit/gemm/device/testbed_sanity.h +++ b/test/unit/gemm/device/testbed_sanity.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_sparse.h b/test/unit/gemm/device/testbed_sparse.h index 28901a9867..e2611210d1 100644 --- a/test/unit/gemm/device/testbed_sparse.h +++ b/test/unit/gemm/device/testbed_sparse.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -295,6 +295,7 @@ struct SparseTestbed { return compare_reference(problem_size, alpha, beta); } + /// Returns true if the CUDA device is sufficient to execute the kernel. bool sufficient() const { // // Determine SMEM requirements and waive if not satisfied @@ -330,10 +331,13 @@ struct SparseTestbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive test if insufficient CUDA device - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } this->initialize(problem_size); diff --git a/test/unit/gemm/device/testbed_splitk.h b/test/unit/gemm/device/testbed_splitk.h index 792d73923a..5e5d7b329f 100644 --- a/test/unit/gemm/device/testbed_splitk.h +++ b/test/unit/gemm/device/testbed_splitk.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -61,6 +61,35 @@ struct TestbedSplitK : public Testbed { ): Base(init_A_, init_B_, init_C_, seed_) { } + /// Returns true if the CUDA device is sufficient to execute the kernel. + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run( cutlass::gemm::GemmCoord problem_size, @@ -68,6 +97,14 @@ struct TestbedSplitK : public Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." 
<< std::endl; + } + return true; + } + this->initialize(problem_size); // diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h index fb36f10e25..4252fd953b 100644 --- a/test/unit/gemm/device/testbed_universal.h +++ b/test/unit/gemm/device/testbed_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -250,6 +250,7 @@ struct TestbedUniversal { return compare_reference(problem_size, alpha, beta); } + /// Returns true if the CUDA device is sufficient to execute the kernel. bool sufficient() const { // // Determine SMEM requirements and waive if not satisfied @@ -286,10 +287,13 @@ struct TestbedUniversal { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive test if insufficient CUDA device - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } this->initialize(problem_size); diff --git a/test/unit/gemm/device/testbed_utils.h b/test/unit/gemm/device/testbed_utils.h index 9325b40fe3..2a77e6c8d0 100644 --- a/test/unit/gemm/device/testbed_utils.h +++ b/test/unit/gemm/device/testbed_utils.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/kernel/batched_gemv.cu b/test/unit/gemm/kernel/batched_gemv.cu new file mode 100755 index 0000000000..bf479641d1 --- /dev/null +++ b/test/unit/gemm/kernel/batched_gemv.cu @@ -0,0 +1,1076 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "testbed_gemv.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcr_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcr_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcr_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcr_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_rcr_alpha_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_rcr_alpha_beta_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord 
problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size, 4.5f, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x24x4096_1x8x4x64_1x1x4x64_rcr_alpha_beta_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 24, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size, cutlass::half_t(4.5f), cutlass::half_t(-0.5f)); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcr_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcr_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcr_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcr_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcr_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + 
test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcr_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcr_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcr_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcr_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcr_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcr_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 
1x7x256x4096_1x8x4x64_1x1x4x64_rcr_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +///////////// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_crc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_crc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_crc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_crc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_crc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_crc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + 
test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_crc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_crc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_crc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_crc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_crc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_crc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + 
kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_crc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_crc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_crc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_crc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_crc_alpha_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_crc_alpha_beta_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, 4.5f, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x24x4096_1x8x4x64_1x1x4x64_crc_alpha_beta_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 24, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = 
cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, cutlass::half_t(4.5f), cutlass::half_t(-0.5f)); +} + +///////////// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + 
cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 
64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_rcc_alpha_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_rcc_alpha_beta_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, 4.5f, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x24x4096_1x8x4x64_1x1x4x64_rcc_alpha_beta_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 24, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + 
cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, cutlass::half_t(4.5f), cutlass::half_t(-0.5f)); +} diff --git a/test/unit/gemm/kernel/testbed_gemv.h b/test/unit/gemm/kernel/testbed_gemv.h new file mode 100755 index 0000000000..fb9c7d7076 --- /dev/null +++ b/test/unit/gemm/kernel/testbed_gemv.h @@ -0,0 +1,352 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/core_io.h" +#include "cutlass/numeric_types.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "cutlass/gemm/kernel/default_gemv.h" +#include "cutlass/gemm/kernel/gemv_batched_strided.h" + +namespace test { +namespace gemm { +namespace kernel { + +template +void batched_gemv_kernel_test(cutlass::gemm::BatchedGemmCoord problem_size, + ElementCD_ alpha = ElementCD_(1), + ElementCD_ beta = ElementCD_(0), + bool perf_test = false, + int perf_test_iter = 1) +{ + using ThreadBlockShape = ThreadBlockShape_; + using ThreadShape = ThreadShape_; + using ElementA = ElementAB_; + using LayoutA = LayoutA_; + using ElementB = ElementAB_; + using LayoutB = LayoutB_; + using ElementAccumulator = ElementCD_; + using ElementCD = ElementCD_; + using LayoutCD = LayoutCD_; + + using GemvKernel = cutlass::gemm::kernel::DefaultGemv; + + using ThreadBlockGemv = typename GemvKernel::ThreadBlockGemv; + using ThreadBlockSwizzle = typename GemvKernel::ThreadBlockSwizzle; + + if (DEBUG) + { + problem_size = cutlass::gemm::BatchedGemmCoord( + problem_size.m(), problem_size.n(), problem_size.k(), 1); + } + + // Create host tensors that will be the backing store for the batches + // Note that no device memory is initially allocated + cutlass::HostTensor matrix_A({problem_size.m(), problem_size.k()}, false); + cutlass::HostTensor matrix_B({problem_size.k(), problem_size.n()}, false); + cutlass::HostTensor matrix_C_computed({problem_size.m(), problem_size.n()}, false); + cutlass::HostTensor matrix_C_reference({problem_size.m(), problem_size.n()}, false); + + // Reserve memory for the batch of tensors + matrix_A.reserve(problem_size.m()*problem_size.k()*problem_size.batch()); + matrix_B.reserve(problem_size.n()*problem_size.k()*problem_size.batch()); + matrix_C_computed.reserve(problem_size.m()*problem_size.n()*problem_size.batch()); + matrix_C_reference.reserve(problem_size.m()*problem_size.n()*problem_size.batch(), false); + + // Fill each tensor batch + const int seed = 9876; + for (int b = 0; b < problem_size.batch(); b++) + { + if(DEBUG) + { + cutlass::reference::host::BlockFillSequential( + matrix_A.host_data_ptr_offset(b*matrix_A.capacity()), matrix_A.capacity()); + cutlass::reference::host::BlockFillSequential( + matrix_B.host_data_ptr_offset(b*matrix_B.capacity()), matrix_B.capacity()); + } + else + { + cutlass::reference::host::TensorFillRandomUniform( + matrix_A.host_view(b*matrix_A.capacity()), + seed + 1660, + 8, + -8, + 0 + ); + + cutlass::reference::host::TensorFillRandomUniform( + matrix_B.host_view(b*matrix_B.capacity()), + seed + 1880, + 8, + -8, + 0 + ); + } + + cutlass::reference::host::TensorFill(matrix_C_computed.host_view(b*matrix_C_computed.capacity())); + cutlass::reference::host::TensorFill(matrix_C_reference.host_view(b*matrix_C_reference.capacity())); + } + + matrix_A.sync_device(); + matrix_B.sync_device(); + matrix_C_computed.sync_device(); + + ThreadBlockSwizzle swizzle; + + cutlass::gemm::BatchedGemmCoord tiled_size{ThreadBlockShape::kM, + ThreadBlockShape::kN, +
problem_size.k(), // no split-k + DEBUG ? 1 : LDG_B }; + + cutlass::gemm::BatchedGemmCoord tiled_shape = swizzle.get_tiled_shape(problem_size, tiled_size); + + #if 0 + printf("tiled_size = %d %d %d %d\n", tiled_size.m(), tiled_size.n(), tiled_size.k(), tiled_size.batch()); + printf("tiled_shape = %d %d %d %d\n", tiled_shape.m(), tiled_shape.n(), tiled_shape.k(), tiled_shape.batch()); + #endif + + // No split-k + EXPECT_EQ(tiled_size.k(), problem_size.k()); + + dim3 grid = swizzle.get_grid_shape(tiled_shape); + dim3 block(tiled_size.n() / ThreadShape::kN, tiled_size.batch(), tiled_size.k() / problem_size.k()); + + // Some sanity checks + EXPECT_TRUE( block.x*block.y*block.z <= 1024 ); + EXPECT_TRUE( block.x <= 1024 ); + EXPECT_TRUE( block.y <= 1024 ); + EXPECT_TRUE( block.z <= 64 ); + + #if 0 + printf("grid dim = %d, %d, %d\n", grid.x, grid.y, grid.z); + printf("block dim = %d, %d, %d\n", block.x, block.y, block.z); + #endif + + cudaError_t result; + cudaEvent_t start_event, end_event; + + for (int iter = 0; iter < (perf_test ? (perf_test_iter+1) : 1); ++iter) + { + if (perf_test && iter == 1) + { + result = cudaEventCreate(&start_event); + EXPECT_EQ(result, cudaSuccess); + + result = cudaEventCreate(&end_event); + EXPECT_EQ(result, cudaSuccess); + + result = cudaEventRecord(start_event); + EXPECT_EQ(result, cudaSuccess); + } + + if (beta == ElementCD(0)) + { + if (alpha == ElementCD(1)) + { + cutlass::gemm::kernel::GemvBatchedStrided<<< grid, block >>>( + problem_size, + matrix_A.device_ref(), + matrix_A.capacity(), + matrix_B.device_ref(), + matrix_B.capacity(), + matrix_C_computed.device_ref(), + matrix_C_computed.capacity() + ); + } + else + { + cutlass::gemm::kernel::GemvBatchedStrided<<< grid, block >>>( + problem_size, + alpha, + matrix_A.device_ref(), + matrix_A.capacity(), + matrix_B.device_ref(), + matrix_B.capacity(), + matrix_C_computed.device_ref(), + matrix_C_computed.capacity() + ); + } + } + else + { + cutlass::gemm::kernel::GemvBatchedStrided<<< grid, block >>>( + problem_size, + alpha, + beta, + matrix_A.device_ref(), + matrix_A.capacity(), + matrix_B.device_ref(), + matrix_B.capacity(), + matrix_C_computed.device_ref(), + matrix_C_computed.capacity(), + matrix_C_computed.device_ref(), + matrix_C_computed.capacity() + ); + } + + if (iter == 0) + { + result = cudaGetLastError(); + EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result); + } + } + + if (perf_test) + { + result = cudaEventRecord(end_event); + EXPECT_EQ(result, cudaSuccess); + } + + result = cudaDeviceSynchronize(); + EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result); + + if (perf_test) + { + float ms; + result = cudaEventElapsedTime(&ms, start_event, end_event); + EXPECT_EQ(result, cudaSuccess); + + double flops = (double(problem_size.m()) * + double(problem_size.n()) * + double(problem_size.k()) * + double(problem_size.batch()) * 2); // 2 for MAC + + double read_bytes = double(problem_size.batch()) * (sizeof(ElementA)*double(problem_size.m())*double(problem_size.k()) + + sizeof(ElementB)*double(problem_size.k())*double(problem_size.n())); + + double write_bytes = double(problem_size.batch()) * (sizeof(ElementCD)*double(problem_size.m())*double(problem_size.n())); + + double avg_runtime = double(ms) / perf_test_iter; + double gflops_per_sec = flops / 1.0e6 / avg_runtime; + double read_bandwidth = read_bytes / 1.0e6 / avg_runtime; + double write_bandwidth = write_bytes / 1.0e6 / avg_runtime; + + std::cout << "\n\nProblem size: " + << problem_size.m() + 
<< " x " << problem_size.n() + << " x " << problem_size.k() + << " x " << problem_size.batch() + << std::endl; + + std::cout << " GFLOPs: " << gflops_per_sec << std::endl; + std::cout << "BW (R/W): " << read_bandwidth << " / " << write_bandwidth << " GB/sec" << std::endl; + std::cout << " Runtime: " << avg_runtime << " ms" << std::endl; + } + else + { + matrix_C_computed.sync_host(); + + // Compute the batched gemms + for (int b = 0; b < problem_size.batch(); b++) + { + cutlass::reference::host::Gemm + reference_gemm; + + reference_gemm( + problem_size.mnk(), alpha, + matrix_A.host_ref(b * matrix_A.capacity()), + matrix_B.host_ref(b * matrix_B.capacity()), beta, + matrix_C_reference.host_ref(b * matrix_C_computed.capacity())); + + bool passed = cutlass::reference::host::TensorEquals( + matrix_C_computed.host_view(b * matrix_C_computed.capacity()), + matrix_C_reference.host_view(b * matrix_C_reference.capacity())); + + EXPECT_TRUE(passed) + //<< "A:\n" << matrix_A.host_view() << "\n" + //<< "B:\n" << matrix_B.host_view() << "\n" + << "Batch: " << b << "\n" + << "Reference:\n" + << matrix_C_reference.host_view(b * matrix_C_reference.capacity()) + << "\n" + << "Computed:\n" + << matrix_C_computed.host_view(b * matrix_C_computed.capacity()) + << "\n"; + } + } +} + +template +void batched_gemv_kernel_perf_test(cutlass::gemm::BatchedGemmCoord problem_size, + ElementCD_ alpha = ElementCD_(1), + ElementCD_ beta = ElementCD_(0), + int iter = 50) +{ + batched_gemv_kernel_test(problem_size, alpha, beta, true, iter); +} + +} // namespace threadblock +} // namespace kernel +} // namespace test diff --git a/test/unit/gemm/thread/CMakeLists.txt b/test/unit/gemm/thread/CMakeLists.txt index 48ca115728..af84c9d0a7 100644 --- a/test/unit/gemm/thread/CMakeLists.txt +++ b/test/unit/gemm/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm50.cu b/test/unit/gemm/thread/gemm_sm50.cu index 4265922841..c28fc20c23 100644 --- a/test/unit/gemm/thread/gemm_sm50.cu +++ b/test/unit/gemm/thread/gemm_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm60.cu b/test/unit/gemm/thread/gemm_sm60.cu index b0b9fdb5b7..3725ccbbd6 100644 --- a/test/unit/gemm/thread/gemm_sm60.cu +++ b/test/unit/gemm/thread/gemm_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm61.cu b/test/unit/gemm/thread/gemm_sm61.cu index f6e7724dd8..50a8ba7839 100644 --- a/test/unit/gemm/thread/gemm_sm61.cu +++ b/test/unit/gemm/thread/gemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/CMakeLists.txt b/test/unit/gemm/thread/host/CMakeLists.txt index c58540264d..136d0f33c4 100644 --- a/test/unit/gemm/thread/host/CMakeLists.txt +++ b/test/unit/gemm/thread/host/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/gemm_sm60_host.cu b/test/unit/gemm/thread/host/gemm_sm60_host.cu index 346b80cbe2..aef63790ff 100644 --- a/test/unit/gemm/thread/host/gemm_sm60_host.cu +++ b/test/unit/gemm/thread/host/gemm_sm60_host.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/testbed_host.h b/test/unit/gemm/thread/host/testbed_host.h index 4d5e441dd5..ef24bbc30f 100644 --- a/test/unit/gemm/thread/host/testbed_host.h +++ b/test/unit/gemm/thread/host/testbed_host.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/testbed.h b/test/unit/gemm/thread/testbed.h index bdfb8278f4..175cd4cdd3 100644 --- a/test/unit/gemm/thread/testbed.h +++ b/test/unit/gemm/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/CMakeLists.txt b/test/unit/gemm/threadblock/CMakeLists.txt index f4f074fe99..7ad3ca784b 100644 --- a/test/unit/gemm/threadblock/CMakeLists.txt +++ b/test/unit/gemm/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. 
All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/batched_gemv.cu b/test/unit/gemm/threadblock/batched_gemv.cu index 94ae947bd2..f4a9d425a3 100644 --- a/test/unit/gemm/threadblock/batched_gemv.cu +++ b/test/unit/gemm/threadblock/batched_gemv.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/epilogue_workspace.cu b/test/unit/gemm/threadblock/epilogue_workspace.cu index 1301aeb4dd..b627a5a96a 100644 --- a/test/unit/gemm/threadblock/epilogue_workspace.cu +++ b/test/unit/gemm/threadblock/epilogue_workspace.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_multistage.cu b/test/unit/gemm/threadblock/mma_multistage.cu index e4a030d6fa..8e76904189 100644 --- a/test/unit/gemm/threadblock/mma_multistage.cu +++ b/test/unit/gemm/threadblock/mma_multistage.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse.cu b/test/unit/gemm/threadblock/mma_multistage_sparse.cu index 13eb180e05..ca5b259c6d 100644 --- a/test/unit/gemm/threadblock/mma_multistage_sparse.cu +++ b/test/unit/gemm/threadblock/mma_multistage_sparse.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h index d667d8f550..a947af7f5c 100644 --- a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h +++ b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed.h b/test/unit/gemm/threadblock/mma_multistage_testbed.h index 6b8dc94fb6..84dfdbdb5c 100644 --- a/test/unit/gemm/threadblock/mma_multistage_testbed.h +++ b/test/unit/gemm/threadblock/mma_multistage_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -267,6 +267,9 @@ struct Testbed { cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); if (result != cudaSuccess) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } return true; } @@ -275,7 +278,10 @@ struct Testbed { cudaFuncAttributePreferredSharedMemoryCarveout, 100); if (result != cudaSuccess) { - return true; + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; } } diff --git a/test/unit/gemm/threadblock/mma_pipelined_simt.cu b/test/unit/gemm/threadblock/mma_pipelined_simt.cu index 522b029adb..010e4306c4 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_simt.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_simt.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu index c9c714bcf6..301b8ea878 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu index e4125eb4f0..134712b660 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm80.cu b/test/unit/gemm/threadblock/mma_pipelined_sm80.cu index 14dd68e72d..7cd16006ef 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm80.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed.h b/test/unit/gemm/threadblock/mma_pipelined_testbed.h index 8190c50a41..ee71c51a6e 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h +++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu index 4fb964c1ae..6214359b26 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu index fd2ae356fa..c67a24740b 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu b/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu index ebcf0a355e..4465a3aa8f 100644 --- a/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu +++ b/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h index 148e34d959..e1b537d556 100644 --- a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h +++ b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu index 8c687f8810..146849d923 100644 --- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu +++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu index 262269b75d..909e56c4f4 100644 --- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu +++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/CMakeLists.txt b/test/unit/gemm/warp/CMakeLists.txt index 695508fa5a..14a85df83e 100644 --- a/test/unit/gemm/warp/CMakeLists.txt +++ b/test/unit/gemm/warp/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -29,6 +29,7 @@ cutlass_test_unit_add_executable( gemm_sm75.cu gemm_sm80.cu gemm_complex_sm80.cu + gemm_sparse_sm80.cu gemm_gaussian_complex_sm80.cu wmma_sm70.cu wmma_sm72.cu diff --git a/test/unit/gemm/warp/gemm_complex_sm80.cu b/test/unit/gemm/warp/gemm_complex_sm80.cu index 99effe4004..abc26487aa 100644 --- a/test/unit/gemm/warp/gemm_complex_sm80.cu +++ b/test/unit/gemm/warp/gemm_complex_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu index 43ad2dfd85..682d37b559 100644 --- a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu +++ b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm50.cu b/test/unit/gemm/warp/gemm_sm50.cu index bb4ba5be58..88b84d8743 100644 --- a/test/unit/gemm/warp/gemm_sm50.cu +++ b/test/unit/gemm/warp/gemm_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -35,7 +35,7 @@ #include "testbed.h" ///////////////////////////////////////////////////////////////////////////////////////////////// - +// NT SMEM layout TEST(SM50_warp_gemm_f32_col_row_col, 32x16x1_4x4x1) { using Policy = cutlass::gemm::warp::MmaSimtPolicy< @@ -58,6 +58,78 @@ TEST(SM50_warp_gemm_f32_col_row_col, 32x16x1_4x4x1) { test::gemm::warp::Testbed >().run(); } +// TN SMEM layout +TEST(SM50_warp_gemm_f32_row_col_col, 32x16x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + + test::gemm::warp::Testbed >().run(); +} + +// TT SMEM layout +TEST(SM50_warp_gemm_f32_row_row_col, 32x16x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + + test::gemm::warp::Testbed >().run(); +} + +// NN SMEM layout +TEST(SM50_warp_gemm_f32_col_col_col, 32x16x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + + test::gemm::warp::Testbed >().run(); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// NT SMEM layout TEST(SM50_warp_gemm_f32_col_row_row, 16x32x1_4x4x1) { using Policy = cutlass::gemm::warp::MmaSimtPolicy< @@ 
-80,8 +152,31 @@ TEST(SM50_warp_gemm_f32_col_row_row, 16x32x1_4x4x1) { test::gemm::warp::Testbed >().run(); } -///////////////////////////////////////////////////////////////////////////////////////////////// +// TN SMEM layout +TEST(SM50_warp_gemm_f32_row_col_row, 16x32x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<4, 8>, + cutlass::layout::RowMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<16, 32, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed >().run(); +} +///////////////////////////////////////////////////////////////////////////////////////////////// +// NT SMEM layout TEST(SM50_warp_gemm_f32_col_row_col, 32x16x1_2x2x1) { using Policy = cutlass::gemm::warp::MmaSimtPolicy< @@ -126,8 +221,52 @@ TEST(SM50_warp_gemm_f32_col_row_row, 32x16x1_2x2x1) { test::gemm::warp::Testbed>().run(); } -///////////////////////////////////////////////////////////////////////////////////////////////// +// TN SMEM layout +TEST(SM50_warp_gemm_f32_row_col_col, 32x16x1_2x2x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<2, 2, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + test::gemm::warp::Testbed>().run(); +} + +TEST(SM50_warp_gemm_f32_row_col_row, 32x16x1_2x2x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::RowMajorInterleaved<2>, + cutlass::gemm::GemmShape<2, 2, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed>().run(); +} +///////////////////////////////////////////////////////////////////////////////////////////////// +// NT SMEM layout TEST(SM50_warp_gemm_f32_col_row_col, 32x64x1_4x4x1) { using Policy = cutlass::gemm::warp::MmaSimtPolicy< @@ -172,6 +311,50 @@ TEST(SM50_warp_gemm_f32_col_row_row, 32x64x1_4x4x1) { test::gemm::warp::Testbed>().run(); } +// TN SMEM layout +TEST(SM50_warp_gemm_f32_row_col_col, 32x64x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<64, 32, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + + test::gemm::warp::Testbed>().run(); +} + +TEST(SM50_warp_gemm_f32_row_col_row, 32x64x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<4, 8>, + cutlass::layout::RowMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 64, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed>().run(); +} 
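The gemm_sm50.cu additions above extend the SM50 SIMT warp-level coverage from the original NT case (A column-major, B row-major in shared memory) to the TN, TT, and NN combinations. Every added test follows the same recipe, so one sketch is enough to show how the pieces fit; the shapes and layouts below are copied from one of the added TN tests rather than invented.

```cpp
#include "cutlass/matrix_shape.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/mma_simt_policy.h"
#include "cutlass/gemm/warp/mma_simt.h"

// Warp-level SIMT policy: an 8x4 arrangement of lanes, each computing a
// 4x4x1 tile, with the interleaved lane layout used by the tests above.
using Policy = cutlass::gemm::warp::MmaSimtPolicy<
    cutlass::MatrixShape<8, 4>,
    cutlass::layout::ColumnMajorInterleaved<2>,
    cutlass::gemm::GemmShape<4, 4, 1>
>;

// "TN" shared-memory layout: A row-major, B column-major, C column-major.
using Mma = cutlass::gemm::warp::MmaSimt<
    cutlass::gemm::GemmShape<32, 16, 8>,   // warp-level M x N x K
    float, cutlass::layout::RowMajor,      // A
    float, cutlass::layout::ColumnMajor,   // B
    float, cutlass::layout::ColumnMajor,   // C
    Policy
>;

// In the unit tests this type is exercised as:
//   test::gemm::warp::Testbed<Mma>().run();
```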
///////////////////////////////////////////////////////////////////////////////////////////////// TEST(SM50_warp_gemm_complex_f32_col_row_col, 64x32x1_2x2x1) { @@ -409,5 +592,4 @@ TEST(SM50_warp_gemm_complex_f64_col_row_row, 32x16x1_1x1x1) { test::gemm::warp::Testbed>().run(); } - ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/warp/gemm_sm60.cu b/test/unit/gemm/warp/gemm_sm60.cu index 4f2f3f1582..2196d10415 100644 --- a/test/unit/gemm/warp/gemm_sm60.cu +++ b/test/unit/gemm/warp/gemm_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm61.cu b/test/unit/gemm/warp/gemm_sm61.cu index 63e07165b6..71a905b5eb 100644 --- a/test/unit/gemm/warp/gemm_sm61.cu +++ b/test/unit/gemm/warp/gemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm70.cu b/test/unit/gemm/warp/gemm_sm70.cu index 3785290e5c..00678f3040 100644 --- a/test/unit/gemm/warp/gemm_sm70.cu +++ b/test/unit/gemm/warp/gemm_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm75.cu b/test/unit/gemm/warp/gemm_sm75.cu index 144475cae4..202e543640 100644 --- a/test/unit/gemm/warp/gemm_sm75.cu +++ b/test/unit/gemm/warp/gemm_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm80.cu b/test/unit/gemm/warp/gemm_sm80.cu index 0f736b1355..32abb54167 100644 --- a/test/unit/gemm/warp/gemm_sm80.cu +++ b/test/unit/gemm/warp/gemm_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sparse_sm80.cu b/test/unit/gemm/warp/gemm_sparse_sm80.cu index 8df0846076..6ae76c11f9 100644 --- a/test/unit/gemm/warp/gemm_sparse_sm80.cu +++ b/test/unit/gemm/warp/gemm_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/testbed.h b/test/unit/gemm/warp/testbed.h index 3cc00fb447..cc5b55b26f 100644 --- a/test/unit/gemm/warp/testbed.h +++ b/test/unit/gemm/warp/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm70.cu b/test/unit/gemm/warp/wmma_sm70.cu index 5b9ce63db1..6d777acf73 100644 --- a/test/unit/gemm/warp/wmma_sm70.cu +++ b/test/unit/gemm/warp/wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm72.cu b/test/unit/gemm/warp/wmma_sm72.cu index 89bfbb5945..3a0c80f687 100644 --- a/test/unit/gemm/warp/wmma_sm72.cu +++ b/test/unit/gemm/warp/wmma_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm75.cu b/test/unit/gemm/warp/wmma_sm75.cu index 3818793e84..0751daeb5c 100644 --- a/test/unit/gemm/warp/wmma_sm75.cu +++ b/test/unit/gemm/warp/wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/CMakeLists.txt b/test/unit/layout/CMakeLists.txt index 29ebdbdd30..df496bc00e 100644 --- a/test/unit/layout/CMakeLists.txt +++ b/test/unit/layout/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/layout/matrix.cu b/test/unit/layout/matrix.cu index e463f0974e..2f686ca209 100644 --- a/test/unit/layout/matrix.cu +++ b/test/unit/layout/matrix.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/tensor.cu b/test/unit/layout/tensor.cu index b4a43fb3a9..68e1dfc16f 100644 --- a/test/unit/layout/tensor.cu +++ b/test/unit/layout/tensor.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/tensor_nhwc.cu b/test/unit/layout/tensor_nhwc.cu index 46482b2b2f..34300f8c68 100644 --- a/test/unit/layout/tensor_nhwc.cu +++ b/test/unit/layout/tensor_nhwc.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/CMakeLists.txt b/test/unit/nvrtc/CMakeLists.txt index 668ea35ebe..86aa42eeca 100644 --- a/test/unit/nvrtc/CMakeLists.txt +++ b/test/unit/nvrtc/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/nvrtc/cutlass/nvrtc/environment.h b/test/unit/nvrtc/cutlass/nvrtc/environment.h index 27e999348c..fd8bae1f82 100644 --- a/test/unit/nvrtc/cutlass/nvrtc/environment.h +++ b/test/unit/nvrtc/cutlass/nvrtc/environment.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/kernel/thread/testbed_kernel.h b/test/unit/nvrtc/kernel/thread/testbed_kernel.h index 500870581d..55edcc5518 100644 --- a/test/unit/nvrtc/kernel/thread/testbed_kernel.h +++ b/test/unit/nvrtc/kernel/thread/testbed_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/stdlib/stdint.h b/test/unit/nvrtc/stdlib/stdint.h index 380216811b..7ceda345a0 100644 --- a/test/unit/nvrtc/stdlib/stdint.h +++ b/test/unit/nvrtc/stdlib/stdint.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/CMakeLists.txt b/test/unit/nvrtc/thread/CMakeLists.txt index 2e12ccfa8c..cb9b189635 100644 --- a/test/unit/nvrtc/thread/CMakeLists.txt +++ b/test/unit/nvrtc/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/gemm_nvrtc.cu b/test/unit/nvrtc/thread/gemm_nvrtc.cu index 785ebcb2ce..b799e6c9be 100644 --- a/test/unit/nvrtc/thread/gemm_nvrtc.cu +++ b/test/unit/nvrtc/thread/gemm_nvrtc.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/testbed.h b/test/unit/nvrtc/thread/testbed.h index 41ba503ad5..1062d7a21c 100644 --- a/test/unit/nvrtc/thread/testbed.h +++ b/test/unit/nvrtc/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt index 96c3716141..d53bc0c1d9 100644 --- a/test/unit/reduction/CMakeLists.txt +++ b/test/unit/reduction/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/CMakeLists.txt b/test/unit/reduction/kernel/CMakeLists.txt index e1983153d1..89bb511a47 100644 --- a/test/unit/reduction/kernel/CMakeLists.txt +++ b/test/unit/reduction/kernel/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/reduce_splitk.cu b/test/unit/reduction/kernel/reduce_splitk.cu index b169cb60f1..6a27736f96 100644 --- a/test/unit/reduction/kernel/reduce_splitk.cu +++ b/test/unit/reduction/kernel/reduce_splitk.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/reduce_splitk_testbed.h b/test/unit/reduction/kernel/reduce_splitk_testbed.h index 8e70407063..4e6274bec0 100644 --- a/test/unit/reduction/kernel/reduce_splitk_testbed.h +++ b/test/unit/reduction/kernel/reduce_splitk_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/thread/CMakeLists.txt b/test/unit/reduction/thread/CMakeLists.txt index 0641590e8c..29de471363 100644 --- a/test/unit/reduction/thread/CMakeLists.txt +++ b/test/unit/reduction/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/thread/reduction_thread.cu b/test/unit/reduction/thread/reduction_thread.cu index f71e30f53c..b2cf8045c3 100644 --- a/test/unit/reduction/thread/reduction_thread.cu +++ b/test/unit/reduction/thread/reduction_thread.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/thread/testbed.h b/test/unit/reduction/thread/testbed.h index 919839b3d6..5873d9e6a1 100644 --- a/test/unit/reduction/thread/testbed.h +++ b/test/unit/reduction/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/test_unit.cpp b/test/unit/test_unit.cpp index 3bb8ac1387..51e9269541 100644 --- a/test/unit/test_unit.cpp +++ b/test/unit/test_unit.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/transform/CMakeLists.txt b/test/unit/transform/CMakeLists.txt index a7b881ae20..d7f800f472 100644 --- a/test/unit/transform/CMakeLists.txt +++ b/test/unit/transform/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/CMakeLists.txt b/test/unit/transform/threadblock/CMakeLists.txt index 0d5e5c44a0..65d31daca1 100644 --- a/test/unit/transform/threadblock/CMakeLists.txt +++ b/test/unit/transform/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/predicated_tile_iterator.cu b/test/unit/transform/threadblock/predicated_tile_iterator.cu index 562c7888a2..be8084e20f 100644 --- a/test/unit/transform/threadblock/predicated_tile_iterator.cu +++ b/test/unit/transform/threadblock/predicated_tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu index 8d2382e4cf..4183ed0f8e 100644 --- a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu +++ b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt index 7f103cbf3c..9f583b821b 100644 --- a/test/unit/util/CMakeLists.txt +++ b/test/unit/util/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. 
All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/util/tensor_reduce.cu b/test/unit/util/tensor_reduce.cu index 5a1afc7f39..d29022b16f 100644 --- a/test/unit/util/tensor_reduce.cu +++ b/test/unit/util/tensor_reduce.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index e43c821e64..753471bf3e 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt index 4bf7577fb8..5b3cec087e 100644 --- a/tools/library/CMakeLists.txt +++ b/tools/library/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h index 27d2bfe6a4..fe5ac8191e 100644 --- a/tools/library/include/cutlass/library/handle.h +++ b/tools/library/include/cutlass/library/handle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h index 6a018a704c..18bfce2454 100644 --- a/tools/library/include/cutlass/library/library.h +++ b/tools/library/include/cutlass/library/library.h @@ -571,7 +571,6 @@ struct ConvDescription : public OperationDescription { }; - ///////////////////////////////////////////////////////////////////////////////////////////////// /// Base class for all operations @@ -933,49 +932,14 @@ struct Conv2dConfiguration { // also includes (split_k_slices, groups) conv::Conv2dProblemSize problem_size; - /// Layout object for activations tensor - layout::TensorNHWC layout_activations; + // stride of operand A + std::vector stride_a; - /// Layout object for filters tensor - layout::TensorNHWC layout_filters; - - /// Layout object for source tensor - layout::TensorNHWC layout_source; - - /// Layout object for output tensor - layout::TensorNHWC layout_output; - - // - // Methods - // + // stride of operand B + std::vector stride_b; - // Mapping functions (A,B,C -> activation,filter,output) - layout::TensorNHWC layout_a(library::ConvKind const &conv_kind) const { - switch (conv_kind) { - case library::ConvKind::kFprop: return layout_activations; - case library::ConvKind::kDgrad: return layout_output; - case library::ConvKind::kWgrad: return layout_output; - default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); - } - } - - layout::TensorNHWC layout_b(library::ConvKind const &conv_kind) const { - switch (conv_kind) { - case library::ConvKind::kFprop: return layout_filters; - case library::ConvKind::kDgrad: return layout_filters; - case library::ConvKind::kWgrad: return layout_activations; - default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); - } - } - - layout::TensorNHWC layout_c(library::ConvKind const &conv_kind) const { - switch (conv_kind) { - case library::ConvKind::kFprop: return layout_output; - case library::ConvKind::kDgrad: return layout_activations; - case library::ConvKind::kWgrad: return layout_filters; - default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); - } - } + // stride of operand C + std::vector stride_c; }; diff --git a/tools/library/include/cutlass/library/manifest.h b/tools/library/include/cutlass/library/manifest.h index 2bde2884b4..99e6b79248 100644 --- a/tools/library/include/cutlass/library/manifest.h +++ b/tools/library/include/cutlass/library/manifest.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
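In library.h, Conv2dConfiguration drops its four TensorNHWC layout members and the layout_a/layout_b/layout_c mapping helpers in favor of plain stride vectors for operands A, B, and C, which callers now fill directly. A minimal sketch of what filling them looks like for a packed NHWC Fprop problem, using the strides the old profiler code computed; the stride_a/stride_b/stride_c field names come from this diff, the element type (elided in this excerpt) is assumed to be int64_t, and the struct and function names are illustrative.

```cpp
#include <cstdint>
#include <vector>

// Illustrative stand-in for the fields added to library::Conv2dConfiguration;
// the real struct also carries split_k_mode and the Conv2dProblemSize.
struct Conv2dConfigurationStrides {
  std::vector<int64_t> stride_a;   // activations for Fprop
  std::vector<int64_t> stride_b;   // filters for Fprop
  std::vector<int64_t> stride_c;   // output for Fprop
};

// Packed NHWC strides for a Fprop problem with activations NxHxWxC, K filters
// of extent RxSxC, and output NxPxQxK, matching the strides previously stored
// in layout_activations / layout_filters / layout_output.
Conv2dConfigurationStrides make_fprop_strides(
    int h, int w, int c, int r, int s, int k, int p, int q) {

  Conv2dConfigurationStrides config;
  config.stride_a = {c, w * c, h * w * c};   // activations
  config.stride_b = {c, s * c, r * s * c};   // filters
  config.stride_c = {k, q * k, p * q * k};   // output
  return config;
}
```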
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index 491997cb89..681fb82837 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -929,10 +929,10 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) -# conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) -# -# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, -# data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 8 @@ -1069,10 +1069,10 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) -# conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) -# -# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, -# data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 16 @@ -1644,10 +1644,10 @@ def GenerateSM80_TensorOp_16832_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) -# conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) -# -# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, -# data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 8 @@ -1825,10 +1825,10 @@ def GenerateSM80_TensorOp_16864_Interleaved(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) -# conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) -# -# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, -# data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) for op in 
operations: op.C.alignment = 16 @@ -2096,7 +2096,6 @@ def GenerateSM80_TensorOp_1688_complex(manifest, args): max_cc = 1024 tile_descriptions = [ - TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc), TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc), TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py index b9538cdbc5..5df09a8970 100644 --- a/tools/library/scripts/library.py +++ b/tools/library/scripts/library.py @@ -187,7 +187,6 @@ class DataType(enum.Enum): } ################################################################################################### - # class ComplexTransform(enum.Enum): none = enum_auto() @@ -312,7 +311,7 @@ class LayoutType(enum.Enum): # ShortLayoutTypeNames = { LayoutType.ColumnMajor: 'n', - LayoutType.ColumnMajorInterleaved32: 'n2', + LayoutType.ColumnMajorInterleaved2: 'n2', LayoutType.ColumnMajorInterleaved32: 'n32', LayoutType.ColumnMajorInterleaved64: 'n64', LayoutType.RowMajor: 't', @@ -343,6 +342,8 @@ class OpcodeClass(enum.Enum): Simt = enum_auto() TensorOp = enum_auto() WmmaTensorOp = enum_auto() + SparseTensorOp = enum_auto() + OpcodeClassNames = { OpcodeClass.Simt: 'simt', diff --git a/tools/library/src/conv2d_operation.h b/tools/library/src/conv2d_operation.h index 5e8f887fd1..9cc332498e 100644 --- a/tools/library/src/conv2d_operation.h +++ b/tools/library/src/conv2d_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/conv3d_operation.h b/tools/library/src/conv3d_operation.h index 32ad036320..6f110a46e1 100644 --- a/tools/library/src/conv3d_operation.h +++ b/tools/library/src/conv3d_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/gemm_operation.h b/tools/library/src/gemm_operation.h index d65e3414d5..5dd2ed2935 100644 --- a/tools/library/src/gemm_operation.h +++ b/tools/library/src/gemm_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu index 3f19def654..6108bdc759 100644 --- a/tools/library/src/handle.cu +++ b/tools/library/src/handle.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index 4bbd21c763..218e1a3f32 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp index 12358dcdd3..bbfc3411f9 100644 --- a/tools/library/src/manifest.cpp +++ b/tools/library/src/manifest.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reduction/init_reduction_operations.cu b/tools/library/src/reduction/init_reduction_operations.cu index 5f86b64f78..41788f5d72 100644 --- a/tools/library/src/reduction/init_reduction_operations.cu +++ b/tools/library/src/reduction/init_reduction_operations.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reduction/reduction_device.cu b/tools/library/src/reduction/reduction_device.cu index e2133cc0a5..c07ba01455 100644 --- a/tools/library/src/reduction/reduction_device.cu +++ b/tools/library/src/reduction/reduction_device.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/conv2d.cu b/tools/library/src/reference/conv2d.cu index 750ebdf31c..f115384dcf 100644 --- a/tools/library/src/reference/conv2d.cu +++ b/tools/library/src/reference/conv2d.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/conv3d.cu b/tools/library/src/reference/conv3d.cu index 1e1544bff6..29dc880a05 100644 --- a/tools/library/src/reference/conv3d.cu +++ b/tools/library/src/reference/conv3d.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/conv_reference_operation.h b/tools/library/src/reference/conv_reference_operation.h index 1e826ab29e..811621c125 100644 --- a/tools/library/src/reference/conv_reference_operation.h +++ b/tools/library/src/reference/conv_reference_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -109,7 +109,19 @@ struct ConvReferenceDispatcher< Conv2dConfiguration const &config = *static_cast(configuration); - ConvKind const conv_kind = ConvKindMap::kId; + // TODO: make below code more general. It is fixed for NHWC now. + layout::TensorNHWC layout_a; + layout::TensorNHWC layout_b; + layout::TensorNHWC layout_c; + + layout_a.stride() = + make_Coord(config.stride_a[0], config.stride_a[1], config.stride_a[2]); + + layout_b.stride() = + make_Coord(config.stride_b[0], config.stride_b[1], config.stride_b[2]); + + layout_c.stride() = + make_Coord(config.stride_c[0], config.stride_c[1], config.stride_c[2]); if (kProvider == Provider::kReferenceHost) { @@ -127,10 +139,10 @@ struct ConvReferenceDispatcher< >( kConvolutionalOperator, config.problem_size, - {ptr_A, config.layout_a(conv_kind)}, - {ptr_B, config.layout_b(conv_kind)}, - {ptr_C, config.layout_c(conv_kind)}, - {ptr_D, config.layout_c(conv_kind)}, + {ptr_A, layout_a}, + {ptr_B, layout_b}, + {ptr_C, layout_c}, + {ptr_D, layout_c}, alpha, beta ); @@ -152,10 +164,10 @@ struct ConvReferenceDispatcher< >( kConvolutionalOperator, config.problem_size, - {ptr_A, config.layout_a(conv_kind)}, - {ptr_B, config.layout_b(conv_kind)}, - {ptr_C, config.layout_c(conv_kind)}, - {ptr_D, config.layout_c(conv_kind)}, + {ptr_A, layout_a}, + {ptr_B, layout_b}, + {ptr_C, layout_c}, + {ptr_D, layout_c}, alpha, beta, stream diff --git a/tools/library/src/reference/gemm.cu b/tools/library/src/reference/gemm.cu index 8e5361fd20..c95f3b5444 100644 --- a/tools/library/src/reference/gemm.cu +++ b/tools/library/src/reference/gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
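With the layout objects removed from Conv2dConfiguration, the reference dispatcher above reconstructs TensorNHWC layouts from the stride vectors before invoking the host or device reference kernels; the in-code TODO notes this is hard-wired to NHWC for now. A small sketch of that reconstruction, assuming three-element stride vectors as produced by the profiler; the helper name is illustrative.

```cpp
#include <cstdint>
#include <vector>
#include "cutlass/coord.h"
#include "cutlass/layout/tensor.h"

// Build a TensorNHWC layout object from the flat stride vector stored in
// Conv2dConfiguration (innermost stride first, as in the diff above).
cutlass::layout::TensorNHWC make_nhwc_layout(std::vector<int64_t> const &stride) {
  cutlass::layout::TensorNHWC layout;
  layout.stride() = cutlass::make_Coord(
      int(stride[0]), int(stride[1]), int(stride[2]));
  return layout;
}
```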
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/gemm_reference_operation.h b/tools/library/src/reference/gemm_reference_operation.h index 11a5230bbe..b331bb5870 100644 --- a/tools/library/src/reference/gemm_reference_operation.h +++ b/tools/library/src/reference/gemm_reference_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/initialize_reference_operations.cu b/tools/library/src/reference/initialize_reference_operations.cu index c749c2bca9..624506d704 100644 --- a/tools/library/src/reference/initialize_reference_operations.cu +++ b/tools/library/src/reference/initialize_reference_operations.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt index 3ac944a9f2..bb3975c4d0 100644 --- a/tools/profiler/CMakeLists.txt +++ b/tools/profiler/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -87,6 +87,7 @@ install( set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM --operation=Gemm --providers=cutlass --verification-providers=cublas,device --junit-output=test_cutlass_profiler_gemm) set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D --operation=Conv2d --providers=cutlass --verification-providers=cudnn,device --junit-output=test_cutlass_profiler_conv2d) set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D --operation=Conv3d --providers=cutlass --verification-providers=cudnn,device,host --junit-output=test_cutlass_profiler_conv3d) +set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM --operation=SparseGemm --providers=cutlass --verification-providers=cublas,device,host --junit-output=test_cutlass_profiler_spgemm) cutlass_add_executable_tests( test_profiler cutlass_profiler DEPENDEES test_all @@ -94,5 +95,6 @@ cutlass_add_executable_tests( CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM DISABLE_EXECUTABLE_INSTALL_RULE ) diff --git a/tools/profiler/src/conv2d_operation_profiler.cu b/tools/profiler/src/conv2d_operation_profiler.cu index 4b91535719..2246e9610e 100644 --- a/tools/profiler/src/conv2d_operation_profiler.cu +++ b/tools/profiler/src/conv2d_operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -381,24 +381,9 @@ Status Conv2dOperationProfiler::initialize_configuration(
   conv_workspace_.configuration.split_k_mode = static_cast(static_cast(problem_.split_k_mode));
 
-  conv_workspace_.configuration.layout_activations.stride() = make_Coord(
-    int(problem_.c),
-    int(problem_.w) * int(problem_.c),
-    int(problem_.h) * int(problem_.w) * int(problem_.c)
-  );
-
-  conv_workspace_.configuration.layout_filters.stride() = make_Coord(
-    int(problem_.c),
-    int(problem_.s) * int(problem_.c),
-    int(problem_.r) * int(problem_.s) * int(problem_.c)
-  );
-
-  conv_workspace_.configuration.layout_output.stride() = make_Coord(
-    int(problem_.k),
-    int(problem_.q) * int(problem_.k),
-    int(problem_.q) * int(problem_.p) * int(problem_.k)
-  );
-
+  conv_workspace_.set_stride_vector(
+      problem_, operation_desc.conv_kind, operation_desc.A.layout,
+      operation_desc.B.layout, operation_desc.C.layout);
 
   // initialize library::ConvArguments
   conv_workspace_.arguments.A = nullptr;
@@ -540,9 +525,12 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
   conv_workspace_.reduction_configuration.problem_size = problem_.eq_gemm_size(conv_kind).mn();
   conv_workspace_.reduction_configuration.partitions = int(problem_.split_k_slices);
   conv_workspace_.reduction_configuration.partition_stride = problem_.eq_gemm_size(conv_kind).mn().product();
-  conv_workspace_.reduction_configuration.ldw = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
-  conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
-  conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.ldw =
+      conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.lds =
+      conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.ldd =
+      conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
 
   // find reduction operation
   library::ReductionFunctionalKey reduction_key(
@@ -616,7 +604,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.A.element,
       operation_desc.A.layout,
       problem_.extent_a(operation_desc.conv_kind),
-      conv_workspace_.stride_a(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_a,
       conv_workspace_.problem_count
     );
@@ -626,7 +614,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.B.element,
       operation_desc.B.layout,
       problem_.extent_b(operation_desc.conv_kind),
-      conv_workspace_.stride_b(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_b,
      conv_workspace_.problem_count
     );
@@ -636,7 +624,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.C.element,
       operation_desc.C.layout,
       problem_.extent_c(operation_desc.conv_kind),
-      conv_workspace_.stride_c(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_c,
       conv_workspace_.problem_count
     );
@@ -645,7 +633,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.C.element,
       operation_desc.C.layout,
       problem_.extent_c(operation_desc.conv_kind),
-      conv_workspace_.stride_c(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_c,
       conv_workspace_.problem_count
     );
@@ -654,10 +642,9 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.C.element,
       operation_desc.C.layout,
       problem_.extent_c(operation_desc.conv_kind),
-      conv_workspace_.stride_c(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_c,
       conv_workspace_.problem_count
     );
-
   }
 
   //
diff --git a/tools/profiler/src/conv2d_operation_profiler.h b/tools/profiler/src/conv2d_operation_profiler.h
index 40c003e1d4..2f99b67ce4 100644
--- a/tools/profiler/src/conv2d_operation_profiler.h
+++ b/tools/profiler/src/conv2d_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -257,42 +257,95 @@ class Conv2dOperationProfiler : public OperationProfiler {
     /// host buffer for tensor c
     std::vector host_tensor_c;
 
-
     //
     // Methods
     //
-    Conv2dWorkspace():
-      A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { }
-
-    // Returns stride vector for tensor A
-    std::vector stride_a(library::ConvKind const &conv_kind) {
-      return {
-        configuration.layout_a(conv_kind).stride()[0],
-        configuration.layout_a(conv_kind).stride()[1],
-        configuration.layout_a(conv_kind).stride()[2]
-      };
-    }
-
-    // Returns stride vector for tensor B
-    std::vector stride_b(library::ConvKind const &conv_kind) {
-
-      return {
-        configuration.layout_b(conv_kind).stride()[0],
-        configuration.layout_b(conv_kind).stride()[1],
-        configuration.layout_b(conv_kind).stride()[2]
-      };
+    Conv2dWorkspace()
+        : A(nullptr),
+          B(nullptr),
+          C(nullptr),
+          Computed(nullptr),
+          Reference(nullptr) {}
+
+    // Set stride vector for tensor activations, filters, output
+    void set_stride_vector(Conv2dProblem const &problem,
+                           library::ConvKind const &conv_kind,
+                           library::LayoutTypeID const &layout_a,
+                           library::LayoutTypeID const &layout_b,
+                           library::LayoutTypeID const &layout_c) {
+      std::vector stride_activations;
+      std::vector stride_filters;
+      std::vector stride_output;
+
+      // Strides for interleaved fprop
+      if (conv_kind == library::ConvKind::kFprop &&
+          ((layout_a == library::LayoutTypeID::kTensorNC32HW32 &&
+            layout_b == library::LayoutTypeID::kTensorC32RSK32 &&
+            layout_c == library::LayoutTypeID::kTensorNC32HW32) ||
+           (layout_a == library::LayoutTypeID::kTensorNC64HW64 &&
+            layout_b == library::LayoutTypeID::kTensorC64RSK64 &&
+            layout_c == library::LayoutTypeID::kTensorNC64HW64))) {
+        int interleave =
+            (layout_a == library::LayoutTypeID::kTensorNC32HW32) ? 32 : 64;
+
+        stride_activations.push_back(int(problem.w) * interleave);
+        stride_activations.push_back(int(problem.w) * int(problem.h) *
+                                     interleave);
+        stride_activations.push_back(int(problem.h) * int(problem.w) *
+                                     int(problem.c));
+
+        stride_filters.push_back(int(problem.k) * interleave);
+        stride_filters.push_back(int(problem.k) * int(problem.s) * interleave);
+        stride_filters.push_back(int(problem.k) * int(problem.s) *
+                                 int(problem.r) * interleave);
+
+        stride_output.push_back(int(problem.q) * interleave);
+        stride_output.push_back(int(problem.q) * int(problem.p) * interleave);
+        stride_output.push_back(int(problem.q) * int(problem.p) *
+                                int(problem.k));
+      } else {
+        // Strides for the rest cases
+        stride_activations.push_back(int(problem.c));
+        stride_activations.push_back(int(problem.w) * int(problem.c));
+        stride_activations.push_back(int(problem.h) * int(problem.w) *
+                                     int(problem.c));
+
+        stride_filters.push_back(int(problem.c));
+        stride_filters.push_back(int(problem.s) * int(problem.c));
+        stride_filters.push_back(int(problem.r) * int(problem.s) *
+                                 int(problem.c));
+
+        stride_output.push_back(int(problem.k));
+        stride_output.push_back(int(problem.q) * int(problem.k));
+        stride_output.push_back(int(problem.q) * int(problem.p) *
+                                int(problem.k));
       }
-
-    // Returns stride vector for tensor C
-    std::vector stride_c(library::ConvKind const &conv_kind) {
-
-      return {
-        configuration.layout_c(conv_kind).stride()[0],
-        configuration.layout_c(conv_kind).stride()[1],
-        configuration.layout_c(conv_kind).stride()[2]
-      };
+      switch (conv_kind) {
+        case library::ConvKind::kFprop:
+          configuration.stride_a = stride_activations;
+          configuration.stride_b = stride_filters;
+          configuration.stride_c = stride_output;
+
+          break;
+        case library::ConvKind::kDgrad:
+          configuration.stride_a = stride_output;
+          configuration.stride_b = stride_filters;
+          configuration.stride_c = stride_activations;
+
+          break;
+        case library::ConvKind::kWgrad:
+          configuration.stride_a = stride_output;
+          configuration.stride_b = stride_activations;
+          configuration.stride_c = stride_filters;
+
+          break;
+        default:
+          throw std::runtime_error(
+              "Invalid Conv Operator (fprop, dgrad, wgrad)");
      }
+    }
   };
 
  protected:
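Note on the hunk above: the three per-tensor stride accessors are replaced by a single set_stride_vector() that computes packed strides once and then assigns them to operands A, B, and C according to the convolution kind. The sketch below restates that mapping for the common non-interleaved NHWC path; it is illustrative only, and ProblemExtents, ConvKind, and the helper names are stand-ins rather than CUTLASS types.

```
// Minimal sketch (not part of the patch): packed NHWC/KRSC/NPQK strides and
// how the operand roles rotate with the convolution kind.
#include <cstdint>
#include <vector>

struct ProblemExtents { int n, h, w, c, k, r, s, p, q; };

struct OperandStrides {
  std::vector<int64_t> stride_a, stride_b, stride_c;
};

// Packed NHWC activations: innermost stride C, then W*C, then H*W*C.
inline std::vector<int64_t> nhwc_strides(ProblemExtents const &x) {
  return { x.c, int64_t(x.w) * x.c, int64_t(x.h) * x.w * x.c };
}

// Packed KRSC filters: C, then S*C, then R*S*C.
inline std::vector<int64_t> krsc_strides(ProblemExtents const &x) {
  return { x.c, int64_t(x.s) * x.c, int64_t(x.r) * x.s * x.c };
}

// Packed NPQK output: K, then Q*K, then Q*P*K.
inline std::vector<int64_t> npqk_strides(ProblemExtents const &x) {
  return { x.k, int64_t(x.q) * x.k, int64_t(x.q) * x.p * x.k };
}

enum class ConvKind { kFprop, kDgrad, kWgrad };

inline OperandStrides assign_strides(ProblemExtents const &x, ConvKind kind) {
  auto act = nhwc_strides(x);
  auto flt = krsc_strides(x);
  auto out = npqk_strides(x);
  switch (kind) {
    case ConvKind::kFprop: return {act, flt, out};  // A=activations, B=filters, C=output
    case ConvKind::kDgrad: return {out, flt, act};  // A=output grad, B=filters, C=activation grad
    default:               return {out, act, flt};  // kWgrad: A=output grad, B=activations, C=filter grad
  }
}
```

The interleaved NC32HW32/NC64HW64 fprop branch in the patch follows the same pattern, with the inner strides scaled by the interleave factor.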
diff --git a/tools/profiler/src/conv3d_operation_profiler.cu b/tools/profiler/src/conv3d_operation_profiler.cu
index 67f21d8f7a..6e45759abf 100644
--- a/tools/profiler/src/conv3d_operation_profiler.cu
+++ b/tools/profiler/src/conv3d_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/conv3d_operation_profiler.h b/tools/profiler/src/conv3d_operation_profiler.h
index 04c2a15e82..2192a984c6 100644
--- a/tools/profiler/src/conv3d_operation_profiler.h
+++ b/tools/profiler/src/conv3d_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cublas_helpers.cpp b/tools/profiler/src/cublas_helpers.cpp
index 3369d9615a..30db20e22c 100644
--- a/tools/profiler/src/cublas_helpers.cpp
+++ b/tools/profiler/src/cublas_helpers.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cublas_helpers.h b/tools/profiler/src/cublas_helpers.h
index c2bf13b5f7..ec1bf0dbba 100644
--- a/tools/profiler/src/cublas_helpers.h
+++ b/tools/profiler/src/cublas_helpers.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cudnn_helpers.cpp b/tools/profiler/src/cudnn_helpers.cpp
index 86f18095bf..838a41a055 100644
--- a/tools/profiler/src/cudnn_helpers.cpp
+++ b/tools/profiler/src/cudnn_helpers.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cudnn_helpers.h b/tools/profiler/src/cudnn_helpers.h
index 58fe4e678f..c93fbc93e8 100644
--- a/tools/profiler/src/cudnn_helpers.h
+++ b/tools/profiler/src/cudnn_helpers.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu
index c1e33ad61e..c53e8c221f 100644
--- a/tools/profiler/src/cutlass_profiler.cu
+++ b/tools/profiler/src/cutlass_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -167,6 +167,7 @@ void CutlassProfiler::print_usage_(std::ostream &out) {
     << " $ cutlass_profiler --operation=Gemm --help\n\n"
     << " $ cutlass_profiler --operation=Conv3d --help\n\n"
     << " $ cutlass_profiler --operation=Conv2d --help\n\n"
+    << " $ cutlass_profiler --operation=SparseGemm --help\n\n"
     ;
 }
diff --git a/tools/profiler/src/cutlass_profiler.h b/tools/profiler/src/cutlass_profiler.h
index d3b592a4ea..8bd44a893d 100644
--- a/tools/profiler/src/cutlass_profiler.h
+++ b/tools/profiler/src/cutlass_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/debug.h b/tools/profiler/src/debug.h
index aed11ca188..7bf5b8e761 100644
--- a/tools/profiler/src/debug.h
+++ b/tools/profiler/src/debug.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu
index 247bcccf15..38a4acbe59 100644
--- a/tools/profiler/src/device_allocation.cu
+++ b/tools/profiler/src/device_allocation.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/device_allocation.h b/tools/profiler/src/device_allocation.h
index b7bb5ec729..0aa9d0ecd1 100644
--- a/tools/profiler/src/device_allocation.h
+++ b/tools/profiler/src/device_allocation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/device_context.cu b/tools/profiler/src/device_context.cu
index a8bd4fa218..3ab6b4c796 100644
--- a/tools/profiler/src/device_context.cu
+++ b/tools/profiler/src/device_context.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -82,6 +82,9 @@ DeviceAllocation *DeviceContext::allocate_tensor(
   if(!options.initialization.fix_data_distribution) {
     // change data distribution based on bit width
     switch(type) {
+      case library::NumericTypeID::kF16:
+        data_distribution.set_uniform(-3, 3, 0);
+        break;
       case library::NumericTypeID::kB1:
         data_distribution.set_uniform(0, 1, 0);
         break;
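The new kF16 case above seeds half-precision tensors from a narrow, integer-valued uniform distribution. A plausible reason is exact verification: small integers round-trip through fp16 without rounding, so device results can be compared against a reference computed in wider precision without tolerance slack. A minimal illustration, not part of the patch, assuming only cutlass::half_t from cutlass/numeric_types.h:

```
#include <cassert>
#include "cutlass/numeric_types.h"

int main() {
  // Mirrors set_uniform(-3, 3, 0) above: integer values of small magnitude.
  for (int i = -3; i <= 3; ++i) {
    cutlass::half_t h(float(i));
    // The round trip through half precision is exact for these values.
    assert(float(h) == float(i));
  }
  return 0;
}
```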
diff --git a/tools/profiler/src/device_context.h b/tools/profiler/src/device_context.h
index 1633a2dd29..5e74f07e20 100644
--- a/tools/profiler/src/device_context.h
+++ b/tools/profiler/src/device_context.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/enumerated_types.cpp b/tools/profiler/src/enumerated_types.cpp
index 29be6f8baf..0b7b21ba04 100644
--- a/tools/profiler/src/enumerated_types.cpp
+++ b/tools/profiler/src/enumerated_types.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/enumerated_types.h b/tools/profiler/src/enumerated_types.h
index e7e713bdbf..6b8429c49f 100644
--- a/tools/profiler/src/enumerated_types.h
+++ b/tools/profiler/src/enumerated_types.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu
index cf7f8ff64c..63bbc32a99 100644
--- a/tools/profiler/src/gemm_operation_profiler.cu
+++ b/tools/profiler/src/gemm_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -659,7 +659,7 @@ bool GemmOperationProfiler::verify_with_cublas_(
     gemm_workspace_.arguments.B = gemm_workspace_.B->data();
     gemm_workspace_.arguments.batch_stride_B = gemm_workspace_.B->batch_stride();
     gemm_workspace_.arguments.C = gemm_workspace_.Reference->data();
-    gemm_workspace_.arguments.batch_stride_D = gemm_workspace_.Reference->batch_stride();
+    gemm_workspace_.arguments.batch_stride_C = gemm_workspace_.Reference->batch_stride();
     gemm_workspace_.arguments.D = gemm_workspace_.Reference->data();
     gemm_workspace_.arguments.batch_stride_D = gemm_workspace_.Reference->batch_stride();
     gemm_workspace_.arguments.alpha = problem_.alpha.data();
diff --git a/tools/profiler/src/gemm_operation_profiler.h b/tools/profiler/src/gemm_operation_profiler.h
index 1c6c5e7ceb..1adc88968d 100644
--- a/tools/profiler/src/gemm_operation_profiler.h
+++ b/tools/profiler/src/gemm_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/gpu_timer.cpp b/tools/profiler/src/gpu_timer.cpp
index eb3a841150..a6297b025b 100644
--- a/tools/profiler/src/gpu_timer.cpp
+++ b/tools/profiler/src/gpu_timer.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/gpu_timer.h b/tools/profiler/src/gpu_timer.h
index 5cd4b0037f..79d8760c2f 100644
--- a/tools/profiler/src/gpu_timer.h
+++ b/tools/profiler/src/gpu_timer.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/main.cpp b/tools/profiler/src/main.cpp
index a1e523111d..4f76a1119d 100644
--- a/tools/profiler/src/main.cpp
+++ b/tools/profiler/src/main.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu index edd6f07ce2..f50b77e1b6 100644 --- a/tools/profiler/src/operation_profiler.cu +++ b/tools/profiler/src/operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/operation_profiler.h b/tools/profiler/src/operation_profiler.h index 731554b6f2..c47741290b 100644 --- a/tools/profiler/src/operation_profiler.h +++ b/tools/profiler/src/operation_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index 6bac578072..eeb7814703 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index 79e0169970..69d93ae257 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_report.cpp b/tools/profiler/src/performance_report.cpp index de184eb04b..afe2debe0e 100644 --- a/tools/profiler/src/performance_report.cpp +++ b/tools/profiler/src/performance_report.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_report.h b/tools/profiler/src/performance_report.h index 5005103158..a2fe5baa3a 100644 --- a/tools/profiler/src/performance_report.h +++ b/tools/profiler/src/performance_report.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_result.cu b/tools/profiler/src/performance_result.cu index 86cabfb753..1a01aa2b06 100644 --- a/tools/profiler/src/performance_result.cu +++ b/tools/profiler/src/performance_result.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_result.h b/tools/profiler/src/performance_result.h index 9e3ebeb5ce..e5dc6a5c95 100644 --- a/tools/profiler/src/performance_result.h +++ b/tools/profiler/src/performance_result.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/problem_space.cpp b/tools/profiler/src/problem_space.cpp index a8c4943218..910764e55d 100644 --- a/tools/profiler/src/problem_space.cpp +++ b/tools/profiler/src/problem_space.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/problem_space.h b/tools/profiler/src/problem_space.h index 8e10dbafce..cf4e766234 100644 --- a/tools/profiler/src/problem_space.h +++ b/tools/profiler/src/problem_space.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.cu b/tools/profiler/src/sparse_gemm_operation_profiler.cu index 7eff2062b0..aa960ec3df 100644 --- a/tools/profiler/src/sparse_gemm_operation_profiler.cu +++ b/tools/profiler/src/sparse_gemm_operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.h b/tools/profiler/src/sparse_gemm_operation_profiler.h index 37905d3b88..9ae62d24cb 100644 --- a/tools/profiler/src/sparse_gemm_operation_profiler.h +++ b/tools/profiler/src/sparse_gemm_operation_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/CMakeLists.txt b/tools/util/CMakeLists.txt index 0d2f86fb99..db4dc3d9b3 100644 --- a/tools/util/CMakeLists.txt +++ b/tools/util/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h index c158ef9768..31187a7969 100644 --- a/tools/util/include/cutlass/util/command_line.h +++ b/tools/util/include/cutlass/util/command_line.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/debug.h b/tools/util/include/cutlass/util/debug.h index 3ebbd4d843..e10e91459a 100644 --- a/tools/util/include/cutlass/util/debug.h +++ b/tools/util/include/cutlass/util/debug.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/device_dump.h b/tools/util/include/cutlass/util/device_dump.h index dac6029c41..1028d5d584 100644 --- a/tools/util/include/cutlass/util/device_dump.h +++ b/tools/util/include/cutlass/util/device_dump.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index 79b123687a..424a0e6f09 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/distribution.h b/tools/util/include/cutlass/util/distribution.h index 0337737747..8e4ea159c4 100644 --- a/tools/util/include/cutlass/util/distribution.h +++ b/tools/util/include/cutlass/util/distribution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/exceptions.h b/tools/util/include/cutlass/util/exceptions.h index 519205f6d2..d8d6ef94cc 100644 --- a/tools/util/include/cutlass/util/exceptions.h +++ b/tools/util/include/cutlass/util/exceptions.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/host_reorder.h b/tools/util/include/cutlass/util/host_reorder.h index 660ee0f956..1935e390c4 100644 --- a/tools/util/include/cutlass/util/host_reorder.h +++ b/tools/util/include/cutlass/util/host_reorder.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -62,15 +62,15 @@ void reorder_column(TensorRef dest,
   }
 }
 
-template
+template
 void reorder_convK(TensorRef dest,
                    TensorRef src,
                    cutlass::gemm::GemmCoord problem_size) {
 
-  TensorRef> mappedDest(dest.data(), dest.stride(0));
-  TensorRef> mappedSrc(src.data(), src.stride(0));
+  TensorRef> mappedDest(dest.data(), dest.stride(0));
+  TensorRef> mappedSrc(src.data(), src.stride(0));
 
-  reorder_column(
+  reorder_column(
       mappedDest, mappedSrc, problem_size);
 }
diff --git a/tools/util/include/cutlass/util/host_tensor.h b/tools/util/include/cutlass/util/host_tensor.h
index 465d74a93b..f105434fde 100644
--- a/tools/util/include/cutlass/util/host_tensor.h
+++ b/tools/util/include/cutlass/util/host_tensor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -238,7 +238,7 @@ class HostTensor {
   Element * host_data() { return host_.data(); }
 
   /// Gets pointer to host data with a pointer offset
-  Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return host_.data() + ptr_element_offset; }
+  Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(host_.data(), ptr_element_offset); }
 
   /// Gets a reference to an element in host memory
   Reference host_data(LongIndex idx) {
@@ -257,7 +257,7 @@ class HostTensor {
   Element * device_data() { return device_.get(); }
 
   /// Gets pointer to device data with a pointer offset
-  Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return device_.get() + ptr_element_offset; }
+  Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(device_data(), ptr_element_offset); }
 
   /// Gets pointer to device data
   Element const * device_data() const { return device_.get(); }
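The HostTensor change above computes pointer offsets through ReferenceFactory rather than raw Element* arithmetic. A likely motivation is packed sub-byte element types (for example 4-bit integers), where one logical element is not one addressable unit, so data() + offset over-advances. The sketch below illustrates the pitfall with a hypothetical packed 4-bit buffer; names and layout are assumptions for illustration, not the CUTLASS implementation.

```
#include <cassert>
#include <cstddef>
#include <cstdint>

struct PackedInt4Buffer {
  uint8_t *storage;  // two 4-bit elements packed per byte

  // Correct mapping: element offset -> containing byte (the low/high nibble
  // would be selected separately by an offset-aware reference type).
  uint8_t *byte_for_element(std::ptrdiff_t element_offset) const {
    return storage + element_offset / 2;
  }
};

int main() {
  uint8_t bytes[8] = {};
  PackedInt4Buffer buf{bytes};

  // Element 6 lives in byte 3 of the packed storage, not at storage + 6,
  // so raw "pointer + element_offset" arithmetic points at the wrong byte.
  assert(buf.byte_for_element(6) == bytes + 3);
  assert((bytes + 6) != buf.byte_for_element(6));
  return 0;
}
```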
diff --git a/tools/util/include/cutlass/util/host_tensor_planar_complex.h b/tools/util/include/cutlass/util/host_tensor_planar_complex.h
index 6bdc8fe47b..50919a1759 100644
--- a/tools/util/include/cutlass/util/host_tensor_planar_complex.h
+++ b/tools/util/include/cutlass/util/host_tensor_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/host_uncompress.h b/tools/util/include/cutlass/util/host_uncompress.h
index 8b630030e5..7f5e8213ba 100644
--- a/tools/util/include/cutlass/util/host_uncompress.h
+++ b/tools/util/include/cutlass/util/host_uncompress.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/detail/inner_product.h b/tools/util/include/cutlass/util/reference/detail/inner_product.h
index f75f8b8884..7fdc2462e8 100644
--- a/tools/util/include/cutlass/util/reference/detail/inner_product.h
+++ b/tools/util/include/cutlass/util/reference/detail/inner_product.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h b/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
index db00e712ed..67e0e1aa10 100644
--- a/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
+++ b/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/device/convolution.h b/tools/util/include/cutlass/util/reference/device/convolution.h
index 843b6b15b9..9d814263e7 100644
--- a/tools/util/include/cutlass/util/reference/device/convolution.h
+++ b/tools/util/include/cutlass/util/reference/device/convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h
index 3e4bfb31b6..93ecdda4e2 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
index 7c736603bb..7ad38a84db 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h index b9bdbfa026..0ff572a270 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h index 3b9688d17a..0e5c668ebb 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h index 4d9de5156e..67ddfdc4f0 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index 8d813ea243..7524b740de 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h index eb61754e47..9aa0a4f923 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_fill.h b/tools/util/include/cutlass/util/reference/device/tensor_fill.h index ff2e5f3666..09ead0ef4d 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_fill.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index 54621006e1..f9031c5cae 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_reduce.h b/tools/util/include/cutlass/util/reference/device/tensor_reduce.h index a268c92526..c8f279c066 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_reduce.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_reduce.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_relu.h b/tools/util/include/cutlass/util/reference/device/tensor_relu.h index d78e19533e..8717c921a5 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_relu.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_relu.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/thread/gemm.h b/tools/util/include/cutlass/util/reference/device/thread/gemm.h index 318e6c8368..880b1a12a9 100644 --- a/tools/util/include/cutlass/util/reference/device/thread/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/thread/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/convolution.h b/tools/util/include/cutlass/util/reference/host/convolution.h index 48f5db81ea..f69ba174bc 100644 --- a/tools/util/include/cutlass/util/reference/host/convolution.h +++ b/tools/util/include/cutlass/util/reference/host/convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h index 6381aa3066..628961e41f 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm.h +++ b/tools/util/include/cutlass/util/reference/host/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_complex.h index 473115ff87..a195ece7ae 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/host/gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h index 127c501bd3..6fe9d8e0ae 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h +++ b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_compare.h b/tools/util/include/cutlass/util/reference/host/tensor_compare.h index 2d7545e907..faa1177590 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_compare.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_copy.h b/tools/util/include/cutlass/util/reference/host/tensor_copy.h index a81f021127..ec62515c35 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_copy.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_copy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h index 88bbb39f45..9dd8995a61 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_fill.h b/tools/util/include/cutlass/util/reference/host/tensor_fill.h index 1a0230b55d..7904b746fd 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_fill.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h index feb439d724..e0dc000c01 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_norm.h b/tools/util/include/cutlass/util/reference/host/tensor_norm.h index c2958e32e3..549167f8d5 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_norm.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_norm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_reduce.h b/tools/util/include/cutlass/util/reference/host/tensor_reduce.h index dd1d4fda66..2d41791576 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_reduce.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_reduce.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/tensor_view_io.h b/tools/util/include/cutlass/util/tensor_view_io.h index 0043d745c2..a097e637ea 100644 --- a/tools/util/include/cutlass/util/tensor_view_io.h +++ b/tools/util/include/cutlass/util/tensor_view_io.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/type_traits.h b/tools/util/include/cutlass/util/type_traits.h index d97af0a421..e4c8951caf 100644 --- a/tools/util/include/cutlass/util/type_traits.h +++ b/tools/util/include/cutlass/util/type_traits.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: