diff --git a/CHANGELOG.md b/CHANGELOG.md index 96053eefb0..eded0a4ef0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,17 @@ # NVIDIA CUTLASS Changelog # CUTLASS 2.x +## [2.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.4.0) (2020-11-19) + * Implicit GEMM convolution kernels supporting CUDA and Tensor Cores on NVIDIA GPUs + * Operators: forward (Fprop), backward data gradient (Dgrad), and backward weight gradient (Wgrad) convolution + * Data type: FP32, complex, Tensor Float 32 (TF32), BFloat16 (BF16), Float16, Int4, Int8, Int32 + * Spatial dimensions: 1-D, 2-D, and 3-D + * Layout: NHWC, NCxHWx + * Implicit GEMM convolution components: + * Global memory iterators supporting fprop, dgrad, and wgrad + * `MmaMultistage` for implicit GEMM convolution for NVIDIA Ampere architecture + * `MmaPipeline` for implicit GEMM convolution for NVIDIA Volta and Turing architectures + * [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation ## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23) * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/) diff --git a/CMakeLists.txt b/CMakeLists.txt index d853a9dd3c..a0ece82c6d 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") -project(CUTLASS VERSION 2.3.0 LANGUAGES CXX) +project(CUTLASS VERSION 2.4.0 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) find_package(Doxygen QUIET) @@ -137,7 +137,12 @@ if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES)) endif() set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CUTLASS_LIBRARY_DEBUG_POSTFIX ".debug" CACHE STRING "Default postfix value for debug libraries") +if (DEFINED CMAKE_DEBUG_POSTFIX) + set(CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT ${CMAKE_DEBUG_POSTFIX}) +else() + set(CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT .debug) +endif() +set(CUTLASS_LIBRARY_DEBUG_POSTFIX ${CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT} CACHE STRING "Default postfix value for debug libraries") if(WIN32) # On Windows we link against the shared (DLL) runtime. Change gtest settings to match this. 
@@ -192,7 +197,6 @@ endif() set(CUTLASS_DEBUG_TRACE_LEVEL "0" CACHE STRING "Level of debug tracing to perform.") list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_DEBUG_TRACE_LEVEL=${CUTLASS_DEBUG_TRACE_LEVEL}) - set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL "Enable PTX mma instruction for collective matrix multiply operations.") @@ -466,21 +470,195 @@ if (CUTLASS_ENABLE_CUBLAS) target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1) endif() +include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake) + +if (CUTLASS_ENABLE_CUDNN) + target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1) +endif() + ################################################################################ -if(CUTLASS_ENABLE_TOOLS) +include(CTest) +enable_testing() +if (NOT TARGET test_all) + add_custom_target(test_all) +endif() + +set(CUTLASS_INSTALL_TESTS ON CACHE BOOL "Install test executables") +set(CUTLASS_TEST_EXECUTION_ENVIRONMENT "" CACHE BOOL "Environment in which to invoke unit test executables") + +set(CMAKE_TEST_INSTALL_PREFIX test CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.") +set(CUTLASS_TEST_INSTALL_PREFIX ${CMAKE_TEST_INSTALL_PREFIX}/cutlass CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.") +set(CUTLASS_TEST_INSTALL_BINDIR ${CUTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.") +set(CUTLASS_TEST_INSTALL_LIBDIR ${CUTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.") + +install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}) +install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR}) +install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_LIBDIR}) +install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest) + +set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.config.cmake) +set(CUTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "") + +function(cutlass_add_executable_tests NAME TARGET) +# +# Generates test rules for `make test`, `make test_all`, and `ctest` invoked from either the +# or the / after installation. +# +# NAME: The base name for the test. Can be run with `make ` or `ctest -R 'c'`. +# TARGET: The target corresponding to the executable under test. +# DISABLE_EXECUTABLE_INSTALL_RULE: An option, if given, that disables creating an install rule for TARGET. +# DEPENDS: A list of targets or files on which this test is dependent. +# DEPENDEES: A list of targets which should depend on this test. +# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments +# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of +# options given. If this option is not used, a single test with no arguments is generated. 
+# + + set(options DISABLE_EXECUTABLE_INSTALL_RULE) + set(oneValueArgs) + set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS) + cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND CUTLASS_INSTALL_TESTS) + + # file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + + install( + TARGETS ${TARGET} + RUNTIME DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR} + ) + + endif() + + if (NOT __TEST_COMMAND_OPTIONS) + set(__TEST_COMMAND_OPTIONS " ") + endif() + + list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT) + set(CMD_IDX 0) + + if (CMD_COUNT GREATER 1) + add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS}) + foreach(DEPENDEE ${__DEPENDEES}) + add_dependencies(${DEPENDEE} ${NAME}) + endforeach() + endif() + + foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS}) + + if (CMD_COUNT GREATER 1) + set(TEST_NAME ${NAME}_${CMD_IDX}) + else() + set(TEST_NAME ${NAME}) + endif() + + # The following rigmarole is needed to deal with spaces and possible quotes in + # command line arguments. The options are passed "by reference" as the actual + # variable names holding the real options. We then expand these in a way that + # preserves any quotes. Note, they have to be in this order for it to work for + # all the use cases below. + + set(CMD_OPTIONS ${${CMD_OPTIONS}}) + list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS) + separate_arguments(CMD_OPTIONS) + + add_custom_target( + ${TEST_NAME} + COMMAND + ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + DEPENDS + ${TARGET} + ) + + if (CMD_COUNT GREATER 1) + add_dependencies(${NAME} ${TEST_NAME}) + endif() + + foreach(DEPENDEE ${__DEPENDEES}) + add_dependencies(${DEPENDEE} ${TEST_NAME}) + endforeach() + + add_test( + NAME c${TEST_NAME} + COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + ) + + if (CUTLASS_INSTALL_TESTS) + + # To run the tests from an install package with tests enabled, we need to generate test files + # that don't rely on the current directory structure in build. 
+ + set(TEST_NAME c${TEST_NAME}) + set(TEST_EXE $) + set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR}) + configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY) + + file(GENERATE + OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" + INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" + ) + + install( + FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" + DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/ + ) + + set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "") + + endif() + + math(EXPR CMD_IDX "${CMD_IDX} + 1") + + endforeach() + +endfunction() + +if (CUTLASS_ENABLE_TOOLS) add_subdirectory(tools) + if (CUTLASS_ENABLE_PROFILER) + add_dependencies(test_all test_profiler) + endif() endif() -if(CUTLASS_ENABLE_EXAMPLES) +if (CUTLASS_ENABLE_EXAMPLES) add_subdirectory(examples) + add_dependencies(test_all test_examples) endif() -if(CUTLASS_ENABLE_TESTS) - include(CTest) - enable_testing() +if (CUTLASS_ENABLE_TESTS) add_subdirectory(test) + add_dependencies(test_all test_unit) +endif() + +if (CUTLASS_INSTALL_TESTS) + + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/cmake") + + file(WRITE "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "# Generated File\n") + foreach(GENERATED_FILE ${CUTLASS_CTEST_GENERATED_FILES}) + file(APPEND "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "include(${GENERATED_FILE})\n") + endforeach() + + install( + FILES "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" + DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ + ) + endif() +#? install( +#? FILES ${CMAKE_BINARY_DIR}/CTestTestfile.cmake +#? DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ +#? ) +#? +#? install( +#? DIRECTORY +#? ${CMAKE_BINARY_DIR}/tools +#? ${CMAKE_BINARY_DIR}/test +#? DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ +#? FILES_MATCHING PATTERN "CTestTestfile.cmake" +#? ) + ################################################################################ install( diff --git a/README.md b/README.md index 88a1b40706..d7a1d7d475 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 2.3 +# CUTLASS 2.4 -_CUTLASS 2.3 - September 2020_ +_CUTLASS 2.4 - November 2020_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -25,11 +25,22 @@ Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations targeting the programmable, high-throughput _Tensor Cores_ implemented by NVIDIA's Volta, Turing, and Ampere architectures. +Additionaly, CUTLASS implements high-performance convolution (implicit GEMM). +Implicit GEMM is the formulation of a convolution operation as a GEMM. This allows CUTLASS +to build convolutions by reusing highly optimized warp-wide GEMM components and below. + See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly. -See the [functionality listing](media/docs/functionality.md) for the list of operations +See the [functionality listing](/media/docs/functionality.md) for the list of operations supported at each level of the execution model hierarchy. 
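To make the implicit GEMM formulation described above concrete, here is a small standalone sketch (not part of CUTLASS; the struct and function names are illustrative) of how a forward-propagation (fprop) convolution maps onto the extents of an equivalent GEMM: each output pixel becomes a GEMM row, each filter a column, and the reduction runs over the R x S x C filter footprint.

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative only (not the CUTLASS API): extents of the GEMM computed by an
// implicit GEMM fprop convolution with NHWC activations, KRSC filters, NPQK output.
struct Conv2dSize {
  int n, h, w, c;   // activation extents N x H x W x C
  int k, r, s;      // filter extents K x R x S (C is shared with the activation)
  int p, q;         // output extents P x Q, already derived from padding/stride/dilation
};

struct GemmSize { int64_t m, n, k; };

// Fprop viewed as a GEMM: M = N*P*Q output pixels, N = K filters, K = R*S*C reduction.
GemmSize implicit_gemm_fprop_size(Conv2dSize const &cs) {
  return { int64_t(cs.n) * cs.p * cs.q, int64_t(cs.k), int64_t(cs.r) * cs.s * cs.c };
}

int main() {
  // Sizes borrowed from the profiler example later in this README diff.
  Conv2dSize cs{8, 224, 224, 128, 128, 3, 3, 224, 224};
  GemmSize g = implicit_gemm_fprop_size(cs);
  std::printf("equivalent GEMM: m=%lld n=%lld k=%lld\n",
              (long long)g.m, (long long)g.n, (long long)g.k);
  return 0;
}
```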
+# What's New in CUTLASS 2.4 +CUTLASS 2.4 is a significant update to CUTLASS adding: +- 1-D, 2-D, and 3-D convolution targeting Tensor and CUDA cores for NVIDIA Ampere, Turing, and Volta GPU architectures +- CUTLASS profiler support for convolution +- [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation +- See the [CHANGELOG](CHANGELOG.md) for more details. + # What's New in CUTLASS 2.3 CUTLASS 2.3 is a minor update to CUTLASS adding: @@ -118,6 +129,7 @@ CUTLASS is described in the following documents and the accompanying - [Functionality](/media/docs/functionality.md) - summarizes functionality available in CUTLASS - [Efficient GEMM in CUDA](media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA - [GEMM API](media/docs/gemm_api.md) - describes the CUTLASS GEMM model and C++ template concepts +- [Implicit GEMM Convolution](media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS - [Code Organization](media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project - [Terminology](media/docs/terminology.md) - describes terms used in the code - [Programming Guidelines](media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++ @@ -140,7 +152,7 @@ CUTLASS unit tests, examples, and utilities can be build with CMake starting ver Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed on your system. -``` +```bash $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc ``` @@ -149,7 +161,7 @@ for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, and 8.6. To reduce the architectures to build CUTLASS for by changing the CMake configuration setting `CUTLASS_NVCC_ARCHS`. -``` +```bash $ mkdir build && cd build $ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA's Ampere Architecture @@ -160,7 +172,7 @@ From the `build/` directory, compile and run the CUTLASS unit tests by building The unit tests are organized as several binaries mirroring the top-level namespaces of CUTLASS, and they may be executed in parallel via make's `-j` command line argument. -``` +```bash $ make test_unit -j ... ... 
@@ -191,6 +203,8 @@ include/ # client applications should target this directory arch/ # direct exposure of architecture features (including instruction-level GEMMs) + conv/ # code specialized for convolution + gemm/ # code specialized for general matrix product computations layout/ # layout definitions for matrices, tensors, and other mathematical objects in memory @@ -210,34 +224,39 @@ include/ # client applications should target this directory ``` examples/ - 00_basic_gemm/ # launches a basic GEMM with single precision inputs and outputs + 00_basic_gemm/ # launches a basic GEMM with single precision inputs and outputs - 01_cutlass_utilities/ # demonstrates CUTLASS Utilities for allocating and initializing tensors + 01_cutlass_utilities/ # demonstrates CUTLASS Utilities for allocating and initializing tensors - 02_dump_reg_smem/ # debugging utilities for printing register and shared memory contents + 02_dump_reg_smem/ # debugging utilities for printing register and shared memory contents - 03_visualize_layout/ # utility for visualizing all layout functions in CUTLASS + 03_visualize_layout/ # utility for visualizing all layout functions in CUTLASS + + 04_tile_iterator/ # example demonstrating an iterator over tiles in memory + + 05_batched_gemm/ # example demonstrating CUTLASS's batched strided GEMM operation - 04_tile_iterator/ # example demonstrating an iterator over tiles in memory + 06_splitK_gemm/ # exmaple demonstrating CUTLASS's Split-K parallel reduction kernel - 05_batched_gemm/ # example demonstrating CUTLASS's batched strided GEMM operation + 07_volta_tensorop_gemm/ # example demonstrating mixed precision GEMM using Volta Tensor Cores - 06_splitK_gemm/ # exmaple demonstrating CUTLASS's Split-K parallel reduction kernel + 08_turing_tensorop_gemm/ # example demonstrating integer GEMM using Turing Tensor Cores - 07_volta_tensorop_gemm/ # example demonstrating mixed precision GEMM using Volta Tensor Cores + 09_turing_tensorop_conv2dfprop/ # example demonstrating integer implicit GEMM convolution (forward propagation) using Turing Tensor Cores - 08_turing_tensorop_gemm/ # example demonstrating integer GEMM using Turing Tensor Cores + 10_planar_complex/ # example demonstrating planar complex GEMM kernels - 10_planar_complex/ # example demonstrating planar complex GEMM kernels + 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes - 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes + 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu - 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu + 13_fused_two_gemms/ # example demonstrating two GEMms fused in one kernel - 13_fused_two_gemms/ # example demonstrating two GEMms fused in one kernel + 22_ampere_tensorop_conv2dfprop/ # example demonstrating integer implicit GEMM convolution (forward propagation) using Ampere Tensor Cores ``` ### Tools + ``` tools/ library/ # CUTLASS Instance Library - contains instantiations of all supported CUTLASS templates @@ -266,14 +285,14 @@ Instructions for building and running the Unit tests are described in the [Quick The `tools/profiler/` directory contains a command-line utility for launching each of the GEMM kernels. It can be built as follows: -``` +```bash $ make cutlass_profiler -j16 ``` By default, only one tile size is instantiated for each data type, math instruction, and layout. 
To instantiate all, set the following environment variable when running CMake from an empty `build/` directory. Beware, this results in *thousands* of kernels and long build times. -``` +```bash $ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all ... $ make cutlass_profiler -j16 @@ -282,7 +301,7 @@ $ make cutlass_profiler -j16 To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with wildcard characters may be reduce the set of kernels. The following builds exactly one kernel: -``` +```bash $ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1 ... $ make cutlass_profiler -j16 @@ -318,6 +337,56 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 Math: 17218.4 GFLOP/s ``` +To compile strictly 2-D or 3-D convolution kernels, filter by operation +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_OPERATIONS=conv2d,conv3d +... +$ make cutlass_profiler -j16 +``` + +or by name + +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_KERNELS=sfprop,s16816fprop,s16816dgrad,s16816wgrad +... +$ make cutlass_profiler -j16 +``` + +Example command line for profiling 2-D convolution kernels is as follows: + +```bash +$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 + + +============================= + Problem ID: 1 + + Provider: CUTLASS + OperationKind: conv2d + Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc + + Status: Success + Verification: ON + Disposition: Passed + +reference_device: Passed + + Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ + --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \ + --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ + --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 + + Bytes: 2055798784 bytes + FLOPs: 118482796544 flops + + Runtime: 8.13237 ms + Memory: 235.431 GiB/s + + Math: 14569.3 GFLOP/s + +``` + [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md) diff --git a/cmake/CTestTestfile.config.cmake b/cmake/CTestTestfile.config.cmake new file mode 100644 index 0000000000..65fda51a70 --- /dev/null +++ b/cmake/CTestTestfile.config.cmake @@ -0,0 +1,19 @@ +# Generated file + +if (DEFINED ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT}) + set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT $ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT}) +else() + set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT @CUTLASS_TEST_EXECUTION_ENVIRONMENT@) +endif() + +if (NOT "@TEST_EXE_DIR@" STREQUAL "") + set(TEST_EXE_PATH @TEST_EXE_DIR@/@TEST_EXE@) +else() + set(TEST_EXE_PATH @TEST_EXE@) +endif() + +add_test("@TEST_NAME@" ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@) + +if (NOT "@TEST_EXE_WORKING_DIRECTORY@" STREQUAL "") + set_tests_properties("@TEST_NAME@" PROPERTIES WORKING_DIRECTORY "@TEST_EXE_WORKING_DIRECTORY@") +endif() diff --git a/cuBLAS.cmake b/cuBLAS.cmake index 4c73a1db4c..0ad6db2378 100644 --- a/cuBLAS.cmake +++ b/cuBLAS.cmake @@ -1,3 +1,24 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. message(STATUS "Configuring cublas ...") diff --git a/cuDNN.cmake b/cuDNN.cmake new file mode 100644 index 0000000000..da5e453131 --- /dev/null +++ b/cuDNN.cmake @@ -0,0 +1,107 @@ + +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +if(DEFINED CUDNN_ENABLED) + set(CUTLASS_ENABLE_CUDNN ${CUDNN_ENABLED} CACHE BOOL "Enable CUTLASS to build with cuDNN library.") +endif() + +if(DEFINED CUTLASS_ENABLE_CUDNN AND NOT CUTLASS_ENABLE_CUDNN) + return() +endif() + +message(STATUS "Configuring cuDNN ...") + +find_path( + _CUDNN_INCLUDE_DIR cudnn.h + PATHS + ${CUDA_TOOLKIT_ROOT_DIR}/include + $ENV{CUDNN_PATH}/include + $ENV{CUDA_PATH}/include + ${CUDNN_PATH}/include + /usr/include) + +find_library( + _CUDNN_LIBRARY cudnn + HINTS + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib + $ENV{CUDNN_PATH}/lib64 + $ENV{CUDNN_PATH}/lib/x64 + $ENV{CUDNN_PATH}/lib + $ENV{CUDA_PATH}/lib64 + $ENV{CUDA_PATH}/lib/x64 + $ENV{CUDA_PATH}/lib + ${CUDNN_PATH}/lib64 + ${CUDNN_PATH}/lib/x64 + ${CUDNN_PATH}/lib + /usr/lib/x86_64-linux-gnu + /usr/lib) + +if(_CUDNN_INCLUDE_DIR AND _CUDNN_LIBRARY) + + message(STATUS "cuDNN: ${_CUDNN_LIBRARY}") + message(STATUS "cuDNN: ${_CUDNN_INCLUDE_DIR}") + + set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found") + +else() + + message(STATUS "cuDNN not found.") + set(CUDNN_FOUND OFF CACHE INTERNAL "cuDNN Library Found") + +endif() + +set(CUTLASS_ENABLE_CUDNN ${CUDNN_FOUND} CACHE BOOL "Enable CUTLASS to build with cuDNN library.") + +if (CUTLASS_ENABLE_CUDNN AND NOT TARGET cudnn) + + set(CUDNN_INCLUDE_DIR ${_CUDNN_INCLUDE_DIR}) + set(CUDNN_LIBRARY ${_CUDNN_LIBRARY}) + + if(WIN32) + add_library(cudnn STATIC IMPORTED GLOBAL) + else() + add_library(cudnn SHARED IMPORTED GLOBAL) + endif() + + add_library(nvidia::cudnn ALIAS cudnn) + + set_property( + TARGET cudnn + PROPERTY IMPORTED_LOCATION + ${CUDNN_LIBRARY}) + + target_include_directories( + cudnn + INTERFACE + $ + $) + +endif() + +if(CUTLASS_ENABLE_CUDNN AND NOT CUDNN_FOUND) + message(FATAL_ERROR "CUTLASS_ENABLE_CUDNN enabled but cuDNN library could not be found.") +endif() + +message(STATUS "Configuring cuDNN ... done.") diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt index e2bb283489..27a87c9292 100644 --- a/examples/03_visualize_layout/CMakeLists.txt +++ b/examples/03_visualize_layout/CMakeLists.txt @@ -20,9 +20,15 @@ # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +set(TEST_COMMAND_00 RowMajor --extent=16,16) +set(TEST_COMMAND_01 "ColumnMajorInterleaved<4>" --extent=32,8 --output-shape=16 --vectorize=4) + cutlass_example_add_executable( 03_visualize_layout visualize_layout.cpp register_layout.cu + TEST_COMMAND_OPTIONS + TEST_COMMAND_00 + TEST_COMMAND_01 ) diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp index a0f2718122..3c4b783ca6 100644 --- a/examples/03_visualize_layout/visualize_layout.cpp +++ b/examples/03_visualize_layout/visualize_layout.cpp @@ -32,6 +32,8 @@ #include #include +#include + #include "options.h" #include "register_layout.h" @@ -133,6 +135,8 @@ int main(int argc, char const *arg[]) { layout_it->second->print_csv(std::cout); + cudaFree(0); // Ensure CUDA is available. 
+ return 0; } diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index d18a4e6ab7..36f794d921 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -188,31 +188,6 @@ using Gemm = cutlass::gemm::device::Gemm 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { - std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; - return -1; - } - - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (!((props.major * 10 + props.minor) >= 75)) { - std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75." - << std::endl; - - // Return 0 so tests are considered passing if run on unsupported platforms. - return 0; - } - const int length_m = 5120; const int length_n = 4096; const int length_k = 4096; @@ -337,18 +312,37 @@ int run() { } int main() { + bool notSupported = false; + // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available // in CUDA 10.2. // // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + notSupported = true; + } - // Returning zero so this test passes when built on older Toolkits. - return 0; + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!((props.major * 10 + props.minor) >= 75)) { + std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75." + << std::endl; + + notSupported = true; } - else { - return run(); + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; } + + return run(); } diff --git a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt new file mode 100644 index 0000000000..b1b5c8df1e --- /dev/null +++ b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + 09_turing_tensorop_conv2dfprop + turing_tensorop_conv2dfprop.cu + ) + diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu new file mode 100644 index 0000000000..cf07efdcb5 --- /dev/null +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -0,0 +1,758 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + + +This example shows how to run convolution kernels using functions and data structures +provided by CUTLASS using tensor cores; which we run on a NVIDIA Turing GPU. + +Writing a single high performance convolution kernel is hard but do-able. Whereas writing +high performance kernels at scale which works for multiple problem sizes with good abstractions is +really hard. CUTLASS solves this problem by providing simplified abstractions to compose +multiple sections of implicit gemm kernel. When used properly, the kernels can hit peak performance +of GPU easily. 
+
+CUTLASS divides a kernel into hierarchical, composable sections: the thread, warp, and threadblock
+levels each compute their own tile, and higher-level tiles are composed from lower-level ones.
+Multiple thread tiles (the tile each thread computes) form a warp tile (the tile each warp
+computes), and multiple warp tiles form a threadblock tile (the tile computed by a threadblock).
+
+In this example, we split variable initialization into two parts:
+1. Setting up data properties: how tensors are laid out in memory and how the kernel can view them
+(the logical-to-physical mapping).
+2. Setting up computation properties: how those tensors are used to compute the output of the
+convolution.
+
+First, we set up the data types of the input tensor A, the weight tensor B, and the output tensor C,
+along with alpha and beta, since the convolution computes C = alpha * Conv(A, B) + beta * C. In
+CUTLASS, the kernel first computes Conv(A, B) and defers the rest to the end of the kernel, because
+alpha * X + beta * C is a simple element-wise operation on X (= Conv(A, B)) and C; we call this the
+epilogue of the kernel. Accordingly, we set the data type of alpha and beta to
+ElementComputeEpilogue = float. We want to use the Turing MMA instructions, which support 4-bit
+signed integers. Because 4-bit integers are not a native C++ type, CUTLASS provides
+cutlass::int4b_t, which we use for the elements of input tensors A and B. We convey all of this to
+the CUTLASS kernel by initializing the template parameters ElementAccumulator (int32_t),
+ElementComputeEpilogue (float), ElementInputA (cutlass::int4b_t), ElementInputB (cutlass::int4b_t),
+and ElementOutput (cutlass::int4b_t). Communicating just the data types is not enough: because the
+data is laid out linearly in memory, we must also convey the tensor layouts, which we do by
+initializing the template parameters LayoutInputA, LayoutInputB, and LayoutOutput to
+cutlass::layout::TensorNHWC. Next, we set up the rule for computing alpha * X + beta * C, the
+epilogue of the kernel, by initializing the template parameter EpilogueOp. It takes the data type
+of the output (ElementOutput), the number of elements per vectorized memory access (which also
+becomes the vector width of the math instructions in the epilogue), the data type of the
+accumulator (int32_t), and the data type used to compute the linear combination
+(ElementComputeEpilogue).
+
+Now that the properties of the data are set up, we set up the properties of the computation.
+
+Second, we set the tile sizes computed by the threadblock, the warp, and the MMA op to 128x128x128,
+64x64x128, and 8x8x32 (MxNxK), respectively. When these are passed to instantiate the CUTLASS
+Implicit GEMM kernel, it internally deduces the number of threads needed per threadblock, the
+amount of shared memory, how to store data in shared memory without bank conflicts, and many other
+details required to compose, initialize, and launch a high-performance Implicit GEMM kernel. This
+is the strength of CUTLASS: it relieves the developer from understanding and coding complicated
+hardware optimizations, which can easily go wrong.
+
+CUTLASS also supports multiple MMA pipelines within a threadblock. An MMA pipeline is the whole
+process of loading input data from global memory to shared memory, loading data from shared memory
+to registers, performing the matrix multiplication, and storing the result to global memory. The
+flow sequence below shows a typical MMA pipeline.
+
+tensor in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
+output to global memory
+
+The problem with a single pipeline is that each stage is synchronous: each stage has to wait until
+the previous one has finished executing. Some stages of the pipeline do not have fixed latency, for
+example the loads from global memory and from shared memory. We can therefore add a second
+pipeline, phase-shifted relative to the first, to hide the latency of the global and shared memory
+loads. The pipelines in the kernel then look like
+
+(1) tensor in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers ->
+(5) mma -> (6) registers -> (7) output to global memory
+
+(1) -> (2) -> (3) tensor in global memory -> (4) registers -> (5) tile in shared memory ->
+(6) registers -> (7) mma -> (8) registers -> (9) output to global memory
+
+This way, the latency of the second global memory load is hidden by computing on input data that
+has already been loaded.
+
+A few more template parameters are initialized, such as the threadblock swizzle, which determines
+which threadblock tile of the output is computed by each threadblock launched on an SM, and the
+CUDA SM architecture of the GPU you want to run on.
+
+These are all put together to create a single type describing the CUTLASS Implicit GEMM kernel,
+using the cutlass::conv::device::ImplicitGemmConvolution template.
+
+The next step is to initialize the physical data, instantiate and initialize the CUTLASS kernel,
+and run it. We use the CUTLASS utilities to initialize, fill, and compare tensors, because they are
+simple and do not get in the way of learning CUTLASS.
+
+Once all the tensors are initialized and filled with data, we create the arguments tuple used to
+launch the CUTLASS kernel. It takes the problem size (N, H, W, C), the filter size (K, R, S, C),
+padding, strides, dilation, the tensor references, alpha, beta, and, importantly, the split
+k-dimension factor. We also query CUTLASS for any scratch-space memory required by the kernel we
+instantiated; if any is needed, we allocate it and pass it along with the other arguments used to
+initialize the CUTLASS kernel, and then the kernel is launched.
+
+Finally, this example can launch a reference convolution kernel (from the CUTLASS utilities) to
+check that the output of the CUTLASS kernel matches the reference implementation.
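+
+As a concrete illustration of the epilogue described above, the sketch below (plain C++, not
+CUTLASS code) shows roughly what the clamped linear combination computes per output element; the
+clamp bounds assume the 4-bit signed range [-8, 7] used elsewhere in this example, and the exact
+rounding/saturation behavior of the CUTLASS epilogue is only approximated here.
+
+  #include <algorithm>
+  #include <cmath>
+  #include <cstdint>
+
+  // alpha * accumulator + beta * source, rounded and clamped to the int4 range.
+  int32_t clamped_linear_combination(int32_t accum, int32_t source, float alpha, float beta) {
+    float x = alpha * float(accum) + beta * float(source);
+    float clamped = std::min(7.0f, std::max(-8.0f, x));
+    return int32_t(std::nearbyint(clamped));
+  }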
+*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes datatype for input, output tensors and computation between +// elements +using ElementAccumulator = int32_t; // Data type of accumulator +using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) +using ElementInputA = cutlass::int4b_t; // Data type of elements in input tensor +using ElementInputB = cutlass::int4b_t; // Data type of elements in input tensor +using ElementOutput = cutlass::int4b_t; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNHWC; +using LayoutInputB = cutlass::layout::TensorNHWC; +using LayoutOutput = cutlass::layout::TensorNHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm75; + +// This code section describes the tile size a thread block will compute +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 128>; // Threadblock tile shape + +// This code section describes tile size a warp will compute +using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; // Warp tile shape + +// This code section describes the size of MMA op +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; // TensorCore instruction shape + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// Number of pipelines you want to use +constexpr int NumStages = 2; + +// This code section describes the epilogue part of the kernel, we use default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, // Data type of output matrix. + 8, // The number of elements per vectorized. + // memory access. This becomes the vector width of + // math instructions in the epilogue too. 
+ ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue>; // Data type for alpha/beta in linear combination + + +using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic +>::Kernel; + +using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options(): + help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 32), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(20), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) { } + + // Verify the problem size is compatible with the CUTLASS Convolution implementation. + bool valid() { + + // + // CUTLASS attempts to load 128b vectors of int4b_t elements. Consequently, + // all pointers, strides, and tensor extents must be divisible by 32 elements. + // + int const kAlignment = 32; + + if ((input_size.c() % kAlignment) || + (filter_size.n() % kAlignment)) { + + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update( + cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + if (cmd.check_cmd_line_flag("ref-check")) { + reference_check = true; + } + + if (cmd.check_cmd_line_flag("perf-check")) { + measure_performance = true; + } + + if (cmd.check_cmd_line_flag("save-workspace")) { + save_workspace = true; + } + + if (cmd.check_cmd_line_flag("benchmark")) { + benchmark = true; + } + + cmd.get_cmd_line_argument("n", input_size.n()); + cmd.get_cmd_line_argument("h", input_size.h()); + cmd.get_cmd_line_argument("w", input_size.w()); + cmd.get_cmd_line_argument("c", input_size.c()); + + cmd.get_cmd_line_argument("k", filter_size.n()); + cmd.get_cmd_line_argument("r", filter_size.h()); + cmd.get_cmd_line_argument("s", filter_size.w()); + filter_size.c() = input_size.c(); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("tag", tag); + + if (filter_size.h() == 3 && filter_size.w() == 3) { + padding = {1, 1, 1, 1}; + } + 
else { + filter_size.h() = 1; + filter_size.w() = 1; + padding = {0, 0, 0, 0}; + } + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "09_turing_tensorop_conv2dfprop example\n\n" + << " This example uses Turing's Tensor Core operators on int4 data types to compute\n" + << " forward convolution on tensors of layout NHWC.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --n Input tensor extent N\n" + << " --h Input tensor extent H\n" + << " --w Input tensor extent W\n" + << " --c Input tensor extent C\n" + << " --k Filter extent K\n" + << " --r Filter extent R\n" + << " --s Filter extent S\n\n" + << " --alpha Epilogue scalar alpha\n" + << " --beta Epilogue scalar beta\n\n" + << " --ref-check If set (true), reference check on the host is computed\n" + << " --perf-check If set (true), performance is measured.\n" + << " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n" + << " --iterations Number of profiling iterations to perform.\n" + << " --save-workspace If set, workspace is written to a text file.\n" + << " --tag String to replicate across the first column in the results table\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" + << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; + + return out; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of multiply-adds = NPQK * CRS + int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct Result { + double runtime_ms; + double gflops; + cutlass::Status status; + cutlass::Status reference_check; + cudaError_t error; + + Result(): + runtime_ms(0), + gflops(0), + status(cutlass::Status::kSuccess), + reference_check(cutlass::Status::kInvalid), + error(cudaSuccess) { } + + static std::ostream & print_header(std::ostream &out, Options const &options) { + + if (!options.tag.empty()) { + out << "Name,"; + } + + out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs"; + + return out; + } + + std::ostream & print(std::ostream &out, int idx, Options const &options) { + + if (!options.tag.empty()) { + out << options.tag << ","; + } + + out + << "conv_" << idx << "," + << options.input_size.n() << "," + << options.input_size.h() << "," + << options.input_size.w() << "," + << options.input_size.c() << "," + << options.filter_size.n() << "," + << options.filter_size.h() << "," + << options.filter_size.w() << "," + << runtime_ms << "," + << gflops; + + return out; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Runs one benchmark +Result profile_convolution(Options const &options) { + + Result result; + + // + // 
Allocate host-device tensors using the CUTLASS Utilities. + // + + cutlass::HostTensor tensor_a(options.input_size); + cutlass::HostTensor tensor_b(options.filter_size); + cutlass::HostTensor tensor_c(options.output_size()); + cutlass::HostTensor tensor_ref_c(options.output_size()); + + // + // Initialize tensors + // + + // Fill tensor A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(7), + ElementInputA(-8), + 0); + + // Fill tensor B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(7), + ElementInputB(-8), + 0); + + // Fill tensor C on host with zeros + cutlass::reference::host::TensorFill( + tensor_c.host_view()); + + // Fill tensor C for reference on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_c.host_view()); + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_ref_c.sync_device(); + + // + // Define arguments for CUTLASS Convolution + // + + // mode (kCrossCorrelation or kConvolution) + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size( + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + mode, + split_k_slices); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_c.device_ref(), + {options.alpha, options.beta}, + }; + + // + // Initialize CUTLASS Convolution + // + + ImplicitGemm implicit_gemm_op; + + size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(result.status); + + // + // Launch initialized CUTLASS kernel + // + result.status = implicit_gemm_op(); + + CUTLASS_CHECK(result.status); + + // + // Optional reference check + // + + if (options.reference_check) { + std::cout << "Verification on host...\n"; + + // Compute with reference implementation + cutlass::reference::host::Conv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementComputeEpilogue, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + problem_size, + tensor_a.host_ref(), + tensor_b.host_ref(), + tensor_c.host_ref(), + tensor_ref_c.host_ref(), + options.alpha, + options.beta + ); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + tensor_c.sync_host(); + + bool passed = cutlass::reference::host::TensorEquals( + tensor_c.host_view(), + tensor_ref_c.host_view()); + + if (!passed) { + result.reference_check = cutlass::Status::kErrorInternal; + std::cout << "ERROR - results miscompared.\n"; + } + else { + result.reference_check = cutlass::Status::kSuccess; + std::cout << "Passed.\n"; + } + } + else { + result.reference_check = cutlass::Status::kInvalid; + } + + if (options.save_workspace) { + + std::stringstream ss; + + ss << "09_tensor_conv_workspace_conv2dfprop_" + << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() + << "_" + << 
options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() + << ".dat"; + + std::ofstream output_workspace(ss.str()); + + output_workspace + << "Input = \n" << tensor_a.host_view() << "\n\n" + << "Filters = \n" << tensor_b.host_view() << "\n\n"; + + if (options.reference_check) { + output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n"; + } + + output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl; + + std::cout << "Results written to '" << ss.str() << "'." << std::endl; + } + + // + // Performance measurement + // + + if (options.measure_performance) { + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + } + + // Record an event at the start of a series of convolution operations. + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Launch a sequence of implicit GEMM operations on the device + for (int iteration = 0; iteration < options.iterations; ++iteration) { + result.status = implicit_gemm_op(); + CUTLASS_CHECK(result.status); + } + + // Record an event when the convolutions have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Print average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + } + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + return 0; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (!(props.major > 7 || (props.major == 7 && props.minor >= 5))) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." 
+ << std::endl; + return 0; + } + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.benchmark) { + // Benchmark several layers + + int batch_sizes[] = {1, 32, 64, 128, 256, 512}; + + struct Benchmark { + int h, w, c, k, r, s; + } layers[] = { + {56, 56, 64, 256, 1, 1}, + {56, 56, 64, 64, 1, 1}, + {56, 56, 64, 64, 3, 3}, + {56, 56, 256, 64, 1, 1}, + {56, 56, 256, 512, 1, 1}, + {56, 56, 256, 128, 1, 1}, + {28, 28, 128, 128, 3, 3}, + {28, 28, 128, 512, 1, 1}, + {28, 28, 512, 128, 1, 1}, + {28, 28, 512, 1024, 1, 1}, + {28, 28, 512, 256, 1, 1}, + {14, 14, 256, 256, 3, 3}, + {14, 14, 256, 1024, 1, 1}, + {14, 14, 1024, 256, 1, 1}, + {14, 14, 1024, 2048, 1, 1}, + {14, 14, 1024, 512, 1, 1}, + {7, 7, 512, 512, 3, 3}, + }; + + Result::print_header(std::cout, options) << std::endl; + + int idx = 1; + + for (auto const &layer : layers) { + for (auto N : batch_sizes) { + + options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c}); + + Result result = profile_convolution(options); + result.print(std::cout, idx, options) << std::endl; + } + + ++idx; + } + } + else { + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + Result result = profile_convolution(options); + + Result::print_header(std::cout, options) << std::endl; + result.print(std::cout, 1, options) << std::endl; + } + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + + + diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu index 1f83a61af9..2b5c779bc6 100644 --- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu +++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu @@ -106,21 +106,6 @@ using Gemm = cutlass::gemm::device::Gemm= 75)) { - std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." - << std::endl; - // Returning zero so this test passes on older Toolkits. Its actions are no-op. - return 0; - } - const int length_m = 5120; const int length_n = 4096; const int length_k = 4096; @@ -265,17 +250,36 @@ int run() { } int main() { + + bool notSupported = false; + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. // // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + notSupported = true; + } + if (notSupported) { // Returning zero so this test passes on older Toolkits. Its actions are no-op. 
return 0; } - else { - return run(); - } + + return run(); } diff --git a/examples/13_fused_two_gemms/fused_gemm.cu b/examples/13_fused_two_gemms/fused_gemm.cu index edc08d3189..b96a0ef090 100644 --- a/examples/13_fused_two_gemms/fused_gemm.cu +++ b/examples/13_fused_two_gemms/fused_gemm.cu @@ -55,22 +55,6 @@ Performance: int run() { - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (!(props.major * 10 + props.minor >= 75)) { - std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." - << std::endl; - - // Returning zero so this test passes on older Toolkits. Its actions are no-op. - return 0; - } - #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) run_nonfused_gemm_s8_sm80(); run_fused_gemm_s8_sm80(); @@ -85,17 +69,38 @@ int run() { } int main() { + + bool notSupported = false; + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. // // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + + notSupported = true; + } + + if (notSupported) { // Returning zero so this test passes on older Toolkits. Its actions are no-op. return 0; } - else { - return run(); - } + + return run(); } diff --git a/examples/13_fused_two_gemms/kernel/b2b_gemm.h b/examples/13_fused_two_gemms/kernel/b2b_gemm.h index 5df5e4e38d..a67b1e877c 100644 --- a/examples/13_fused_two_gemms/kernel/b2b_gemm.h +++ b/examples/13_fused_two_gemms/kernel/b2b_gemm.h @@ -335,7 +335,7 @@ struct B2bGemm { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op_1.set_k_partition(threadblock_tile_offset.k()); + output_op_1.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } // Tile iterator loading from source tensor. diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu index 2533557134..84eadc5eab 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu @@ -113,31 +113,6 @@ using Gemm = cutlass::gemm::device::Gemm= 11)) { - std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; - return -1; - } - - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (!((props.major * 10 + props.minor) >= 80)) { - std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80." 
- << std::endl; - - // Return 0 so tests are considered passing if run on unsupported platforms. - return 0; - } - const int length_m = 5120; const int length_n = 4096; const int length_k = 4096; @@ -262,17 +237,36 @@ int run() { } int main() { + + bool notSupported = false; + // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available // in CUDA 11.0. // // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ >= 11)) { std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + notSupported = true; + } - // Returning zero so this test passes when built on older Toolkits. - return 0; + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!((props.major * 10 + props.minor) >= 80)) { + std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + notSupported = true; } - else { - return run(); + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; } + + return run(); } diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu index 02f65b199e..1b233c488b 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu +++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu @@ -71,7 +71,7 @@ using SmArch = cutlass::arch::Sm80; // This code section describes the tile size a thread block will compute using ShapeMMAThreadBlock = - cutlass::gemm::GemmShape<256, 128, 256>; // <- threadblock tile M = 128, N = 128, K = 256 + cutlass::gemm::GemmShape<128, 128, 256>; // <- threadblock tile M = 128, N = 128, K = 256 // This code section describes tile size a warp will compute using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 256>; // <- warp tile M = 64, N = 64, K = 256 // This code section describes the size of MMA op @@ -123,31 +123,6 @@ constexpr int kMetaSizeInBits = Gemm::kMetaSizeInBits; int run() { - // Ampere Sparse Tensor Core operations exposed with mma.sync and ldmatrix are first available - // in CUDA 11.1. - // - // CUTLASS must be compiled with CUDA 11.1 Toolkit to run these examples. - if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))) { - std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.1 Toolkit or later." << std::endl; - return -1; - } - - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (!((props.major * 10 + props.minor) >= 80)) { - std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80." - << std::endl; - - // Return 0 so tests are considered passing if run on unsupported platforms. - return 0; - } - const int length_m = 512; const int length_n = 512; const int length_k = 1024; @@ -295,17 +270,37 @@ int run() { } int main() { + + bool notSupported = false; + // Ampere Sparse Tensor Core operations exposed with mma.sync and ldmatrix are first available // in CUDA 11.1. 
// // CUTLASS must be compiled with CUDA 11.1 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))) { std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.1 Toolkit or later." << std::endl; + notSupported = true; + } - // Returning zero so this test passes when built on older Toolkits. - return 0; + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; } - else { - return run(); + + if (!((props.major * 10 + props.minor) >= 80)) { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + notSupported = true; } + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + return run(); } diff --git a/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt b/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt new file mode 100644 index 0000000000..1b7daac3dc --- /dev/null +++ b/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + 22_ampere_tensorop_conv2dfprop + ampere_tensorop_conv2dfprop.cu + ) + diff --git a/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu new file mode 100644 index 0000000000..cb7c398661 --- /dev/null +++ b/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -0,0 +1,763 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+
+This example shows how to run convolution kernels on an NVIDIA Ampere GPU with Tensor Cores, using
+the functions and data structures provided by CUTLASS.
+
+Writing a single high-performance convolution kernel is hard but doable. Writing high-performance
+kernels at scale that work for multiple problem sizes with good abstractions is significantly
+harder. CUTLASS solves this problem by providing simplified abstractions to compose the sections of
+an implicit GEMM kernel. When used properly, the kernels can approach peak GPU performance.
+
+CUTLASS divides a kernel into hierarchical, composable sections. At the thread, warp, and
+threadblock level, each section computes its own tile, with higher-level tiles composed from
+lower-level ones. Multiple thread tiles (the tile size each thread computes) form a warp tile (the
+tile size each warp computes), and multiple warp tiles form a threadblock tile (the tile size
+computed by a threadblock).
+
+In this example, we split variable initialization into two parts:
+1. Setting up data properties: how tensors are laid out in memory and how the kernel views them
+(the logical-to-physical mapping).
+2. Setting up computation properties: how the tensors above are used to compute the output of the
+convolution.
+
+First, we set up the data types of the input tensor A, the weight tensor B, and the output tensor C,
+along with alpha and beta, since the convolution computes C = alpha * Conv2dFprop(A, B) + beta * C.
+In CUTLASS, the kernel first computes Conv2dFprop(A, B) and defers the rest of the computation to
+the end of the kernel, because alpha * X + beta * C is a simple element-wise operation on
+X (= Conv2dFprop(A, B)) and C. We call this the epilogue of the kernel. Hence, we set the data types
+of alpha and beta to ElementComputeEpilogue = float. We use cutlass::half_t for the elements of the
+input tensors A and B. We convey this to the CUTLASS kernel by initializing the template variables
+ElementAccumulator (float), ElementComputeEpilogue (float), ElementInputA (cutlass::half_t),
+ElementInputB (cutlass::half_t), and ElementOutput (float). Communicating just the data types is not
+enough. Because the data is laid out linearly in memory, we also have to convey the layout of the
+tensors, which we do by initializing the template variables LayoutInputA, LayoutInputB, and
+LayoutOutput to the CUTLASS type TensorNHWC. Next, we set up the rules to compute the epilogue
+alpha * X + beta * C by initializing the template variable EpilogueOp, which takes the data type of
+the output ElementOutput (float), the number of elements per vectorized memory access (128 bits,
+i.e. 4 float elements), the data type of the accumulator (float), and the data type used to compute
+the linear combination (alpha * X + beta * C).
+
+Now that we have set up the properties of the data, we set up the properties of the computation.
+
+Second, we create template variables for the threadblock, warp, and MMA-op tile sizes: 128x128x64,
+64x64x64, and 16x8x16 (MxNxK), respectively. When these are passed to instantiate the CUTLASS
+Implicit GEMM kernel, it internally deduces the number of threads needed per threadblock, the
+amount of shared memory, how to store data in a bank-conflict-free manner, and many other variables
+required to compose, initialize, and launch a high-performance Implicit GEMM kernel. This is the
+beauty of CUTLASS: it relieves developers from understanding and coding complicated hardware
+optimizations that can easily go wrong.
+
+CUTLASS also supports multiple MMA pipelines in a threadblock. An MMA pipeline is the whole process
+of loading input data from global memory to shared memory, loading data from shared memory to
+registers, performing matrix multiplication, and storing results to global memory. The flow below
+shows a typical MMA multistage pipeline
+(see include/cutlass/conv/threadblock/implicit_gemm_multistage.h):
+
+tensor in global memory --cp_async--> tile in shared memory --smem loads--> registers
+--mma--> registers --global stores--> output to global memory
+
+NVIDIA Ampere uses `cp_async` to build a multistage software pipeline that better hides latency.
+
+A few more template variables are initialized, such as the swizzle that determines which threadblock
+tile of the output is computed by which launched threadblock, and the CUDA SM architecture of the
+GPU you want to run on.
+
+These are all put together to create a template variable that describes the CUTLASS Implicit GEMM
+kernel, using the cutlass::conv::device::ImplicitGemmConvolution template.
+
+The next step is to initialize physical data, then instantiate, initialize, and run the CUTLASS
+kernel. We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and
+do not get in the way of learning CUTLASS.
+
+Once all the tensors are initialized and filled with data, we create the arguments tuple used to
+launch the CUTLASS kernel. It takes the problem size (N = 1, H = 32, W = 32, C = 32 by default),
+the filter size (K = 32, R = 3, S = 3, C = 32 by default), padding, strides, dilation, tensors,
+alpha, beta and, importantly, the split k-dimension factor. Along with that, we query CUTLASS for
+any scratch-space memory required by the kernel we instantiated. If needed, we allocate it and pass
+it along with the other arguments to initialize the CUTLASS kernel; then the kernel is launched.
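+
+As a rough sketch (using only the type aliases and member functions defined later in this file),
+the sequence described above looks like the following outline rather than complete code:
+
+  typename ImplicitGemm::Arguments arguments{...};     // problem size, tensors, alpha/beta, split-K
+  ImplicitGemm implicit_gemm_op;
+  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+  implicit_gemm_op.initialize(arguments, workspace.get());
+  implicit_gemm_op();                                  // launch the Implicit GEMM convolution kernel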
+ +In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to +compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel. +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes datatype for input, output tensors and computation between +// elements +using ElementAccumulator = float; // Data type of accumulator +using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) +using ElementInputA = cutlass::half_t; // Data type of elements in input tensor +using ElementInputB = cutlass::half_t; // Data type of elements in input tensor +using ElementOutput = float; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNHWC; +using LayoutInputB = cutlass::layout::TensorNHWC; +using LayoutOutput = cutlass::layout::TensorNHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm80; + +// This code section describes the tile size a thread block will compute +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>; // Threadblock tile shape + +// This code section describes tile size a warp will compute +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; // Warp tile shape + +// This code section describes the size of MMA op +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // TensorCore instruction shape + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// Number of pipelines you want to use +constexpr int NumStages = 3; + +// This code section describe iterator algorithm selected is Analytic or Optimized +static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kAnalytic; + +// This code section describes the epilogue part of the kernel, we use default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // Data type of output matrix. + 128 / cutlass::sizeof_bits::value, // The number of elements per vectorized. + // memory access. This becomes the vector width of + // math instructions in the epilogue too. 
+ ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue>; // Data type for alpha/beta in linear combination + + +using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm +>::Kernel; + +using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options(): + help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 32), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(20), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) { } + + // Verify the problem size is compatible with the CUTLASS Convolution implementation. + bool valid() { + + // + // CUTLASS attempts to load 128b vectors of cutlass::half_t (F16) elements. Consequently, + // all pointers, strides, and tensor extents must be divisible by 8 elements. + // + int const kAlignment = 8; + + if ((input_size.c() % kAlignment) || + (filter_size.n() % kAlignment)) { + + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update( + cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + if (cmd.check_cmd_line_flag("ref-check")) { + reference_check = true; + } + + if (cmd.check_cmd_line_flag("perf-check")) { + measure_performance = true; + } + + if (cmd.check_cmd_line_flag("save-workspace")) { + save_workspace = true; + } + + if (cmd.check_cmd_line_flag("benchmark")) { + benchmark = true; + } + + cmd.get_cmd_line_argument("n", input_size.n()); + cmd.get_cmd_line_argument("h", input_size.h()); + cmd.get_cmd_line_argument("w", input_size.w()); + cmd.get_cmd_line_argument("c", input_size.c()); + + cmd.get_cmd_line_argument("k", filter_size.n()); + cmd.get_cmd_line_argument("r", filter_size.h()); + cmd.get_cmd_line_argument("s", filter_size.w()); + filter_size.c() = input_size.c(); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("tag", tag); + + if (filter_size.h() == 3 && filter_size.w() == 3) { + padding = {1, 1, 1, 1}; + } + else { + filter_size.h() 
= 1; + filter_size.w() = 1; + padding = {0, 0, 0, 0}; + } + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "22_ampere_tensorop_conv2dfprop example\n\n" + << " This example uses Ampere's Tensor Core operators on F16 data types to compute\n" + << " forward convolution on tensors of layout NHWC.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --n Input tensor extent N\n" + << " --h Input tensor extent H\n" + << " --w Input tensor extent W\n" + << " --c Input tensor extent C\n" + << " --k Filter extent K\n" + << " --r Filter extent R\n" + << " --s Filter extent S\n\n" + << " --alpha Epilogue scalar alpha\n" + << " --beta Epilogue scalar beta\n\n" + << " --ref-check If set (true), reference check on the host is computed\n" + << " --perf-check If set (true), performance is measured.\n" + << " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n" + << " --iterations Number of profiling iterations to perform.\n" + << " --save-workspace If set, workspace is written to a text file.\n" + << " --tag String to replicate across the first column in the results table\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" + << "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; + + return out; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of multiply-adds = NPQK * CRS + int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct Result { + double runtime_ms; + double gflops; + cutlass::Status status; + cutlass::Status reference_check; + cudaError_t error; + + Result(): + runtime_ms(0), + gflops(0), + status(cutlass::Status::kSuccess), + reference_check(cutlass::Status::kInvalid), + error(cudaSuccess) { } + + static std::ostream & print_header(std::ostream &out, Options const &options) { + + if (!options.tag.empty()) { + out << "Name,"; + } + + out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs"; + + return out; + } + + std::ostream & print(std::ostream &out, int idx, Options const &options) { + + if (!options.tag.empty()) { + out << options.tag << ","; + } + + out + << "conv_" << idx << "," + << options.input_size.n() << "," + << options.input_size.h() << "," + << options.input_size.w() << "," + << options.input_size.c() << "," + << options.filter_size.n() << "," + << options.filter_size.h() << "," + << options.filter_size.w() << "," + << runtime_ms << "," + << gflops; + + return out; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Runs one benchmark +Result profile_convolution(Options const &options) { + + Result result; + + // + // Allocate host-device 
tensors using the CUTLASS Utilities. + // + + cutlass::HostTensor tensor_a(options.input_size); + cutlass::HostTensor tensor_b(options.filter_size); + cutlass::HostTensor tensor_c(options.output_size()); + cutlass::HostTensor tensor_ref_c(options.output_size()); + + // + // Initialize tensors + // + + // Fill tensor A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(7), + ElementInputA(-8), + 0); + + // Fill tensor B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(7), + ElementInputB(-8), + 0); + + // Fill tensor C on host with zeros + cutlass::reference::host::TensorFill( + tensor_c.host_view()); + + // Fill tensor C for reference on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_c.host_view()); + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_ref_c.sync_device(); + + // + // Define arguments for CUTLASS Convolution + // + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + typename ImplicitGemm::Arguments arguments{ + { + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + mode, + split_k_slices + }, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_c.device_ref(), + {options.alpha, options.beta}, + + + }; + + // + // Initialize CUTLASS Convolution + // + + ImplicitGemm implicit_gemm_op; + + size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(result.status); + + // + // Launch initialized CUTLASS kernel + // + result.status = implicit_gemm_op(); + + CUTLASS_CHECK(result.status); + + // + // Optional reference check + // + + if (options.reference_check) { + std::cout << "Verification on host...\n"; + + cutlass::conv::Conv2dProblemSize problem_size( + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + mode + ); + + // Compute with reference implementation + cutlass::reference::host::Conv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementComputeEpilogue, + ElementAccumulator, + cutlass::NumericConverter + >( + problem_size, + tensor_a.host_ref(), + tensor_b.host_ref(), + tensor_c.host_ref(), + tensor_ref_c.host_ref(), + options.alpha, + options.beta + ); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + tensor_c.sync_host(); + + bool passed = cutlass::reference::host::TensorEquals( + tensor_c.host_view(), + tensor_ref_c.host_view()); + + if (!passed) { + result.reference_check = cutlass::Status::kErrorInternal; + std::cout << "ERROR - results miscompared.\n"; + } + else { + result.reference_check = cutlass::Status::kSuccess; + std::cout << "Passed.\n"; + } + } + else { + result.reference_check = cutlass::Status::kInvalid; + } + + if (options.save_workspace) { + + std::stringstream ss; + + ss << "22_ampere_workspace_conv2dfprop_" + << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() 
+ << "_" + << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() + << ".dat"; + + std::ofstream output_workspace(ss.str()); + + output_workspace + << "Input = \n" << tensor_a.host_view() << "\n\n" + << "Filters = \n" << tensor_b.host_view() << "\n\n"; + + if (options.reference_check) { + output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n"; + } + + output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl; + + std::cout << "Results written to '" << ss.str() << "'." << std::endl; + } + + // + // Performance measurement + // + + if (options.measure_performance) { + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + } + + // Record an event at the start of a series of convolution operations. + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Launch a sequence of implicit GEMM operations on the device + for (int iteration = 0; iteration < options.iterations; ++iteration) { + result.status = implicit_gemm_op(); + CUTLASS_CHECK(result.status); + } + + // Record an event when the convolutions have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Print average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + } + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + bool notSupported = false; + + // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." 
+ << std::endl; + notSupported = true; + } + + if (notSupported) { + return 0; + } + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.benchmark) { + // Benchmark several layers + + int batch_sizes[] = {1, 32, 64, 128, 256, 512}; + + struct Benchmark { + int h, w, c, k, r, s; + } layers[] = { + {56, 56, 64, 256, 1, 1}, + {56, 56, 64, 64, 1, 1}, + {56, 56, 64, 64, 3, 3}, + {56, 56, 256, 64, 1, 1}, + {56, 56, 256, 512, 1, 1}, + {56, 56, 256, 128, 1, 1}, + {28, 28, 128, 128, 3, 3}, + {28, 28, 128, 512, 1, 1}, + {28, 28, 512, 128, 1, 1}, + {28, 28, 512, 1024, 1, 1}, + {28, 28, 512, 256, 1, 1}, + {14, 14, 256, 256, 3, 3}, + {14, 14, 256, 1024, 1, 1}, + {14, 14, 1024, 256, 1, 1}, + {14, 14, 1024, 2048, 1, 1}, + {14, 14, 1024, 512, 1, 1}, + {7, 7, 512, 512, 3, 3}, + }; + + Result::print_header(std::cout, options) << std::endl; + + int idx = 1; + + for (auto const &layer : layers) { + for (auto N : batch_sizes) { + + options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c}); + + Result result = profile_convolution(options); + result.print(std::cout, idx, options) << std::endl; + } + + ++idx; + } + } + else { + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + Result result = profile_convolution(options); + + Result::print_header(std::cout, options) << std::endl; + result.print(std::cout, 1, options) << std::endl; + } + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + + + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index aabfa53c62..d51df92c70 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -22,15 +22,20 @@ set(CUTLASS_EXAMPLES_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common) +add_custom_target(cutlass_examples) +add_custom_target(test_examples) + function(cutlass_example_add_executable NAME) set(options) set(oneValueArgs) - set(multiValueArgs) + set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS}) + add_dependencies(cutlass_examples ${NAME}) + target_link_libraries( ${NAME} PRIVATE @@ -44,18 +49,20 @@ function(cutlass_example_add_executable NAME) ${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR} ) - add_custom_target( - test_${NAME} - COMMAND - ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ - DEPENDS - ${NAME} + install( + TARGETS ${NAME} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) -endfunction() + cutlass_add_executable_tests( + test_examples_${NAME} ${NAME} + DEPENDS ${__DEPENDS} + DEPENDEES test_examples ${__DEPENDEES} + TEST_COMMAND_OPTIONS ${__TEST_COMMAND_OPTIONS} + DISABLE_EXECUTABLE_INSTALL_RULE + ) -add_custom_target(cutlass_examples) -add_custom_target(test_examples) +endfunction() foreach(EXAMPLE 00_basic_gemm @@ -67,16 +74,16 @@ foreach(EXAMPLE 06_splitK_gemm 07_volta_tensorop_gemm 08_turing_tensorop_gemm + 09_turing_tensorop_conv2dfprop 10_planar_complex 11_planar_complex_array 12_gemm_bias_relu 13_fused_two_gemms 14_ampere_tf32_tensorop_gemm 15_ampere_sparse_tensorop_gemm -) + 22_ampere_tensorop_conv2dfprop + ) add_subdirectory(${EXAMPLE}) - add_dependencies(cutlass_examples ${EXAMPLE}) - add_dependencies(test_examples test_${EXAMPLE}) endforeach() diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h index 
04c568760e..045196cb8f 100644 --- a/include/cutlass/arch/memory_sm80.h +++ b/include/cutlass/arch/memory_sm80.h @@ -74,6 +74,10 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async { + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + /// Copy CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { @@ -104,6 +108,10 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async_zfill { + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + /// Copy with zero fill CUTLASS_DEVICE cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) { @@ -138,6 +146,10 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async { + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + /// Copy CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { @@ -171,6 +183,10 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async_zfill { + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + /// Copy with zero fill CUTLASS_DEVICE cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { @@ -235,4 +251,3 @@ CUTLASS_DEVICE void cp_async_wait<0>() { } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index 49f3979cab..729cd17917 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -201,5 +201,5 @@ struct SparseMma; #include "cutlass/arch/mma_sm70.h" #include "cutlass/arch/mma_sm75.h" #include "cutlass/arch/mma_sm80.h" -#include "cutlass/arch/sp_mma_sm80.h" +#include "cutlass/arch/mma_sparse_sm80.h" ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index a862e65df5..c5e0db9720 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -365,7 +365,7 @@ struct Mma< } }; -/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 template <> struct Mma< gemm::GemmShape<8, 8, 16>, @@ -599,7 +599,7 @@ struct Mma< } }; -/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 template <> struct Mma< gemm::GemmShape<8,8,16>, diff --git a/include/cutlass/arch/sp_mma_sm80.h b/include/cutlass/arch/mma_sparse_sm80.h similarity index 99% rename from include/cutlass/arch/sp_mma_sm80.h rename to include/cutlass/arch/mma_sparse_sm80.h index 0c8989b86a..a93fd2924c 100644 --- a/include/cutlass/arch/sp_mma_sm80.h +++ b/include/cutlass/arch/mma_sparse_sm80.h @@ -29,7 +29,15 @@ #pragma once -#include "mma_sm80.h" +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h index 88968abdc5..0a556aee3a 100644 --- 
a/include/cutlass/arch/wmma.h +++ b/include/cutlass/arch/wmma.h @@ -52,7 +52,7 @@ #endif #endif -#endif //__clang__ +#endif //!defined(__clang__) #if defined(CUTLASS_ARCH_WMMA_ENABLED) @@ -82,6 +82,12 @@ struct CutlassToWmmaDataType { using Type = __half; }; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11) +template<> +struct CutlassToWmmaDataType { + using Type = __nv_bfloat16; +}; +#endif /// Statically maps int8_t => char template<> @@ -158,6 +164,14 @@ template<> struct WmmaToCutlassDataType<__half> { using Type = cutlass::half_t; }; + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11) +template<> +struct WmmaToCutlassDataType<__nv_bfloat16> { + using Type = cutlass::bfloat16_t; +}; +#endif + //////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h new file mode 100644 index 0000000000..735103722d --- /dev/null +++ b/include/cutlass/conv/conv2d_problem_size.h @@ -0,0 +1,450 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This file contains definitions and utility functions for describing convolution problem sizes. + + Conv2dProblem desciption: + activation (NHWC), + filter (KRSC), + output (NPQK), + pading (pad_h, pad_w), + stride (stride_h, stride_w), + dilation (dilation_h, dilation_w). 
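+
+    For the constructors below that compute the output size rather than taking it as an argument,
+    P and Q are derived from the remaining parameters exactly as implemented in those constructors:
+
+      P = ((H + pad_h * 2 - R * dilation_h) / stride_h) + 1
+      Q = ((W + pad_w * 2 - S * dilation_w) / stride_w) + 1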
+ + Free functions to map: + Map tensor extents (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator) + Map tensor sizes (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator) + Map tensor problem sizes (Conv2d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator) +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/conv/convolution.h" + +namespace cutlass { +namespace conv { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Problem size structure +struct Conv2dProblemSize { + + // Conv2d strictly problem size parameters + int N, H, W, C, P, Q, K, R, S; + int pad_h, pad_w; + int stride_h, stride_w; + int dilation_h, dilation_w; + Mode mode; + + // Conv2d implementation-related parameters + int split_k_slices; + int groups; + + // + // Methods + // + +public: + CUTLASS_HOST_DEVICE + Conv2dProblemSize(): + N(0), H(0), W(0), C(0), P(0), Q(0), K(0), R(0), S(0), + pad_h(0), pad_w(0), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), + mode(Mode::kConvolution), split_k_slices(1), groups(1) { } + + /// Constructor for default padding, stride, dilation, and split-K + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + int N, + int H, + int W, + int C, + int P, + int Q, + int K, + int R, + int S, + Mode mode + ): + N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S), + pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), + mode(mode), split_k_slices(1), groups (1) { } + + /// Constructor + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + int N, + int H, + int W, + int C, + int K, + int R, + int S, + int P, + int Q, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + Mode mode, + int split_k_slices = 1, + int groups = 1 + ): + N(N), H(H), W(W), C(C), K(K), R(R), S(S), P(P), Q(Q), + pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w), + dilation_h(dilation_h), dilation_w(dilation_w), + mode(mode), split_k_slices(split_k_slices), groups (groups) { } + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // set user-defined output size and sets P and Q (include all data members in ctor) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord filter_size, // KRSC + cutlass::Tensor4DCoord padding, // pad_h, _, pad_w, _ + cutlass::MatrixCoord stride, // stride_h, stride_w + cutlass::MatrixCoord dilation, // dilation_h, dilation_w + cutlass::Tensor4DCoord output_size, // NPQK + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + pad_h(padding[0]), pad_w(padding[2]), + stride_h(stride.row()), stride_w(stride.column()), + dilation_h(dilation.row()), dilation_w(dilation.column()), + P(output_size.h()), Q(output_size.w()), + mode(mode), split_k_slices(split_k_slices), groups(groups) {} + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // computes output size and sets P and Q (skip output from ctor arguments) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord 
filter_size, // KRSC + cutlass::Tensor4DCoord padding, // pad_h, _, pad_w, _ + cutlass::MatrixCoord stride, // stride_h, stride_w + cutlass::MatrixCoord dilation, // dilation_h, dilation_w + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + pad_h(padding[0]), pad_w(padding[2]), + stride_h(stride.row()), stride_w(stride.column()), + dilation_h(dilation.row()), dilation_w(dilation.column()), + mode(mode), split_k_slices(split_k_slices), groups(groups) { + // set output P and Q + P = ((H + pad_h * 2 - R * dilation_h) / stride_h) + 1; + Q = ((W + pad_w * 2 - S * dilation_w) / stride_w) + 1; + } + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // set user-defined output size and sets P and Q (skip padding, striding, and dilation) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord filter_size, // KRSC + cutlass::Tensor4DCoord output_size, // NPQK + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + P(output_size.h()), Q(output_size.w()), + pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), + dilation_h(1), dilation_w(1), + mode(mode), split_k_slices(split_k_slices), groups(groups) {} + + // Reset covolution mode in the problem + CUTLASS_HOST_DEVICE + Conv2dProblemSize reset_mode(cutlass::conv::Mode mode_) { + Conv2dProblemSize tmp(*this); + tmp.mode = mode_; + return tmp; + } + + // Reset covolution mode in the problem + CUTLASS_HOST_DEVICE + Conv2dProblemSize reset_split_k_slices(int split_k_slices_) { + Conv2dProblemSize tmp(*this); + tmp.split_k_slices = split_k_slices_; + return tmp; + } + + /// Equality operator (ignores mode and split_k_slice) + CUTLASS_HOST_DEVICE + bool operator==(Conv2dProblemSize const &conv) const { + return ( + (N == conv.N) && (W == conv.H) && (W == conv.W) && (C == conv.C) && + (K == conv.K) && (R == conv.R) && (S == conv.S) && + (P == conv.P) && (Q == conv.Q) && + (pad_h == conv.pad_h) && (pad_w == conv.pad_w) && + (stride_h == conv.stride_h) && (stride_w == conv.stride_w) && + (dilation_h == conv.dilation_h) && (dilation_h == conv.dilation_h) + ); + } + + /// Inequality operator + CUTLASS_HOST_DEVICE + bool operator!=(Conv2dProblemSize const &rhs) const { + return !(*this == rhs); + } + + /// Returns activation extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord activation_extent() const { + + return cutlass::Tensor4DCoord ({N, H, W, C}); + } + + /// Returns filter extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord filter_extent() const { + + return cutlass::Tensor4DCoord ({K, R, S, C}); + } + + /// Returns output extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord output_extent() const { + + return cutlass::Tensor4DCoord ({N, P, Q, K}); + } + + /// Returns activation size in number of elements + CUTLASS_HOST_DEVICE + int64_t activation_size() const { + + return (N * H * W * C); + } + + /// Returns filter size in number of elements + CUTLASS_HOST_DEVICE + int64_t filter_size() const { + + return (K * R * S * C); + } + + /// Returns output size in number of elements + CUTLASS_HOST_DEVICE + int64_t 
output_size() const { + + return (N * P * Q * K); + } + + /// Returns output extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord padding() const { + + return cutlass::Tensor4DCoord ({pad_h, pad_h, pad_w, pad_w}); + } + + /// Returns stride as MatrixCoord + CUTLASS_HOST_DEVICE + cutlass::MatrixCoord stride() const { + + return cutlass::MatrixCoord ({stride_h, stride_w}); + } + + /// Returns dilation as MatrixCoord + CUTLASS_HOST_DEVICE + cutlass::MatrixCoord dilation() const { + + return cutlass::MatrixCoord ({dilation_h, dilation_w}); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// ImplicitGemm helper functions // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Determine the problem size of the implicit GEMM operation +CUTLASS_HOST_DEVICE +cutlass::gemm::GemmCoord implicit_gemm_problem_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + // Compute problem size + switch (conv_operator) { + case Operator::kFprop: + return gemm::GemmCoord( + problem_size.N * problem_size.P * problem_size.Q, + problem_size.K, + problem_size.R * problem_size.S * problem_size.C + ); + case Operator::kDgrad: + return gemm::GemmCoord( + problem_size.N * problem_size.H * problem_size.W, + problem_size.C, + problem_size.R * problem_size.S * problem_size.K + ); + case Operator::kWgrad: + return gemm::GemmCoord( + problem_size.K, + problem_size.R * problem_size.S * problem_size.C, + problem_size.N * problem_size.P * problem_size.Q + ); + default: + break; + } + return gemm::GemmCoord(); +} + +// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm +CUTLASS_HOST_DEVICE +int implicit_gemm_k_iterations( + Operator conv_operator, + int threadblock_K, + Conv2dProblemSize const &problem_size) { + + int iterations = 0; + int elements_per_split_k_slice = 0; + + switch (conv_operator) { + case Operator::kFprop: + elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kDgrad: + elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kWgrad: + elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; + break; + + default: + break; + } + + return iterations; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output) +//////////////////////////////////////////////////////////////////////////////// +/// Returns ImplicitGemm tensor A extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_a_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.output_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_extent(); + 
default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor B extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_b_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent(); + default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor C extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_c_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent(); + default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor A size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_a_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.output_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor B size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_b_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor C size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_c_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.activation_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_size(); + default : break; + } + return 0; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace conv +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/conv3d_problem_size.h b/include/cutlass/conv/conv3d_problem_size.h new file mode 100644 index 0000000000..91827d2724 --- /dev/null +++ b/include/cutlass/conv/conv3d_problem_size.h @@ -0,0 +1,453 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This file contains definitions and utility functions for describing convolution problem sizes. + + Conv3dProblem desciption: + activation (NDHWC), + filter (KTRSC), + output (NZPQK), + pading (pad_d, pad_h, pad_w), + stride (stride_d, stride_h, stride_w), + dilation (dilation_d, dilation_h, dilation_w). + + Free functions to map: + Map tensor extents (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator) + Map tensor sizes (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator) + Map tensor problem sizes (Conv3d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator) +*/ + +#pragma once + +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +namespace cutlass { +namespace conv { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Problem size structure +struct Conv3dProblemSize : public Conv2dProblemSize { + // + // Type definitions + // + + // 3D coordinate for padding, stride, and dilation in (d, h, w) dimensions + using Coord3D = Coord<3>; + + // + // Data members + // + + // Conv3d strictly problem size parameters + int D, T, Z; // input depth, filter depth, output depth + int pad_d; // padding in depth dimension + int stride_d; // stride in depth dimension + int dilation_d; // dilation in depth dimension + + // + // Methods + // +public: + CUTLASS_HOST_DEVICE + Conv3dProblemSize(): + D(0), T(0), Z(0), + pad_d(0), + stride_d(1), + dilation_d(1), + Conv2dProblemSize() { } + + /// Constructor for default padding, stride, dilation, and split-K + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + int N, + int D, + int H, + int W, + int C, + int Z, + int P, + int Q, + int K, + int T, + int R, + int S, + Mode mode + ): + D(D), T(T), Z(Z), + pad_d(T / 2), stride_d(1), dilation_d(1), + Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode) { } + + /// Constructor + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + int N, + int D, + int H, + int W, + int C, + int K, + int T, + int R, + int S, + int Z, + int P, + int Q, + int pad_d, + int pad_h, + int pad_w, + int stride_d, + int stride_h, + int stride_w, + int dilation_d, + int dilation_h, + int dilation_w, + 
Mode mode,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    D(D), T(T), Z(Z),
+    pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d),
+    Conv2dProblemSize(
+      N, H, W, C, K, R, S, P, Q,
+      pad_h, pad_w,
+      stride_h, stride_w,
+      dilation_h, dilation_w,
+      mode, split_k_slices, groups) { }
+
+  /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D
+  // set *user-defined* output size and sets Z, P, and Q (include all data members in ctor)
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    cutlass::Tensor5DCoord input_size,    // NDHWC
+    cutlass::Tensor5DCoord filter_size,   // KTRSC
+    Coord3D padding,                      // pad_d, pad_h, pad_w
+    Coord3D stride,                       // stride_d, stride_h, stride_w
+    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
+    cutlass::Tensor5DCoord output_size,   // NZPQK
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    D(input_size.d()), T(filter_size.d()), Z(output_size.d()),
+    pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]),
+    Conv2dProblemSize(
+      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
+      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
+      {padding[1], padding[1], padding[2], padding[2]},
+      {stride[1], stride[2]},
+      {dilation[1], dilation[2]},
+      {output_size.n(), output_size.h(), output_size.w(), output_size.c()},
+      mode, split_k_slices, groups
+    ) { }
+
+  /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D
+  // *computes* output size and sets Z, P and Q (include all data members in ctor)
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    cutlass::Tensor5DCoord input_size,    // NDHWC
+    cutlass::Tensor5DCoord filter_size,   // KTRSC
+    Coord3D padding,                      // pad_d, pad_h, pad_w
+    Coord3D stride,                       // stride_d, stride_h, stride_w
+    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    D(input_size.d()), T(filter_size.d()),
+    pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]),
+    Conv2dProblemSize(
+      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
+      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
+      {padding[1], padding[1], padding[2], padding[2]},
+      {stride[1], stride[2]},
+      {dilation[1], dilation[2]},
+      mode, split_k_slices, groups
+    ) {
+    // set output Z
+    Z = ((D + pad_d - T * dilation_d) / stride_d) + 1;
+  }
+
+  /// Equality operator (ignores mode and split_k_slices)
+  CUTLASS_HOST_DEVICE
+  bool operator==(Conv3dProblemSize const &conv) const {
+    return (
+      (N == conv.N) && (D == conv.D) && (H == conv.H) && (W == conv.W) && (C == conv.C) &&
+      (K == conv.K) && (T == conv.T) && (R == conv.R) && (S == conv.S) &&
+      (Z == conv.Z) && (P == conv.P) && (Q == conv.Q) &&
+      (pad_d == conv.pad_d) && (pad_h == conv.pad_h) && (pad_w == conv.pad_w) &&
+      (stride_d == conv.stride_d) && (stride_h == conv.stride_h) && (stride_w == conv.stride_w) &&
+      (dilation_d == conv.dilation_d) && (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w)
+    );
+  }
+
+  /// Inequality operator
+  CUTLASS_HOST_DEVICE
+  bool operator!=(Conv3dProblemSize const &rhs) const {
+    return !(*this == rhs);
+  }
+
+  // Reset convolution mode in the problem
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize reset_mode(cutlass::conv::Mode mode_) {
+    Conv3dProblemSize tmp(*this);
+    tmp.mode = mode_;
+    return tmp;
+  }
+
+  // Reset split-K slice count in the problem
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize
reset_split_k_slices(int split_k_slices_) { + Conv3dProblemSize tmp(*this); + tmp.split_k_slices = split_k_slices_; + return tmp; + } + + /// Returns activation extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord activation_extent() const { + + return cutlass::Tensor5DCoord ({N, D, H, W, C}); + } + + /// Returns filter extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord filter_extent() const { + + return cutlass::Tensor5DCoord ({K, T, R, S, C}); + } + + /// Returns output extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord output_extent() const { + + return cutlass::Tensor5DCoord ({N, Z, P, Q, K}); + } + + /// Returns activation size in number of elements + CUTLASS_HOST_DEVICE + int64_t activation_size() const { + + return (N * D * H * W * C); + } + + /// Returns filter size in number of elements + CUTLASS_HOST_DEVICE + int64_t filter_size() const { + + return (K * T * R * S * C); + } + + /// Returns output size in number of elements + CUTLASS_HOST_DEVICE + int64_t output_size() const { + + return (N * Z * P * Q * K); + } + + /// Returns output extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + Coord3D padding() const { + + return Coord3D ({pad_d, pad_h, pad_w}); + } + + /// Returns stride as MatrixCoord + CUTLASS_HOST_DEVICE + Coord3D stride() const { + + return Coord3D ({stride_d, stride_h, stride_w}); + } + + /// Returns dilation as MatrixCoord + CUTLASS_HOST_DEVICE + Coord3D dilation() const { + + return Coord3D ({dilation_d, dilation_h, dilation_w}); + } + +}; + + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// ImplicitGemm helper functions // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Determine the problem size of the implicit GEMM operation +CUTLASS_HOST_DEVICE +cutlass::gemm::GemmCoord implicit_gemm_problem_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + // Compute problem size + switch (conv_operator) { + case Operator::kFprop: + return gemm::GemmCoord( + problem_size.N * problem_size.Z * problem_size.P * problem_size.Q, + problem_size.K, + problem_size.T * problem_size.R * problem_size.S * problem_size.C + ); + case Operator::kDgrad: + return gemm::GemmCoord( + problem_size.N * problem_size.D * problem_size.H * problem_size.W, + problem_size.C, + problem_size.T * problem_size.R * problem_size.S * problem_size.K + ); + case Operator::kWgrad: + return gemm::GemmCoord( + problem_size.K, + problem_size.T * problem_size.R * problem_size.S * problem_size.C, + problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + ); + default: + break; + } + return gemm::GemmCoord(); +} + +// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm +CUTLASS_HOST_DEVICE +int implicit_gemm_k_iterations( + Operator conv_operator, + int threadblock_K, + Conv3dProblemSize const &problem_size) { + + int iterations = 0; + int elements_per_split_k_slice = 0; + + switch (conv_operator) { + case Operator::kFprop: + elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kDgrad: + elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.T * problem_size.R * 
problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kWgrad: + elements_per_split_k_slice = (problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; + break; + + default: + break; + } + + return iterations; +} + +//////////////////////////////////////////////////////////////////////////////// +// Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output) +//////////////////////////////////////////////////////////////////////////////// +/// Returns ImplicitGemm tensor A extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_a_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.output_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor B extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_b_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor C extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_c_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor A size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_a_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.output_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor B size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_b_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor C size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_c_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_size(); + case 
cutlass::conv::Operator::kDgrad: return problem_size.activation_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_size(); + default : break; + } + return 0; +} + +} // namespace conv +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/convolution.h b/include/cutlass/conv/convolution.h new file mode 100644 index 0000000000..c743ea6faa --- /dev/null +++ b/include/cutlass/conv/convolution.h @@ -0,0 +1,118 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief + +This file contains definitions and utility functions for describing convolution problem sizes in terms of +activation (NHWC), filter (KRSC), output (NPQK), pading (pad_h, pad_w), stride (stride_h, stride_w), +dilation (dilation_h, dilation_w). Furthermore, it defines helper functions to map cutlass' implicit gemm +tensor extents, sizes, data types to that of convolutions extents, sizes, and data types. + + * Mapping convolutions to Gemm computation * + +Cutlass employs ImplicitGemm algorithm to implement convolutions. ImplicitGemm algorithm runs gemm operation +on convolution tensors Activation, Filter, and Output . 
The underlying gemm operation follows the standard gemm definition:
+
+    C = A * B + C
+
+    A and B are input matrices
+    C is the source and output matrix
+
+
+For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped
+onto the convolution tensors Activation, Filter, and Output as shown in the table below:
+
+        ___________________________________________________________________________
+         ConvolutionalOperator |      A       |      B       |      C
+        ___________________________________________________________________________
+        |        |             |              |              |
+        | Fprop  |  Activation |    Filter    |    Output    |
+        | Dgrad  |   Output    |    Filter    |  Activation  |
+        | Wgrad  |   Output    |  Activation  |    Filter    |
+        ___________________________________________________________________________
+
+In the convolution codebase, DO NOT mix the (A, B, C) naming with the (Activation, Filter, Output) naming.
+
+For example, a convolution class or function that takes A, B, and Output is confusing and error-prone.
+Instead, use the mapping functions below and consistently use either (A, B, C) or (Activation, Filter, Output).
+
+Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap
+Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+
+namespace cutlass {
+namespace conv {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Convolutional operator
+enum class Operator {
+  kFprop,
+  kDgrad,
+  kWgrad
+};
+
+/// Distinguishes convolution from cross correlation
+enum class Mode {
+  kCrossCorrelation,
+  kConvolution
+};
+
+/// Selects among several implementation variants trading off performance with simplicity
+enum class IteratorAlgorithm {
+  kAnalytic,    ///< functionally correct in all cases but lower performance
+  kOptimized    ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
+};
+
+/// Distinguishes among partial specializations that accelerate certain problems where convolution
+/// stride is unity.
+enum class StrideSupport {
+  kStrided,     ///< arbitrary convolution stride
+  kUnity        ///< unit convolution stride
+};
+
+/// Identifies split-K mode
+enum class SplitKMode {
+  kNone,
+  kSerial,
+  kParallel
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace conv
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/conv/device/implicit_gemm_convolution.h b/include/cutlass/conv/device/implicit_gemm_convolution.h
new file mode 100644
index 0000000000..0aa03d1997
--- /dev/null
+++ b/include/cutlass/conv/device/implicit_gemm_convolution.h
@@ -0,0 +1,263 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Template for device-level Implicit GEMM Convolution +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/conv/convolution.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class ImplicitGemmConvolution { +public: + + using ImplicitGemmKernel = ImplicitGemmKernel_; + + using ElementA = typename ImplicitGemmKernel::ElementA; + using LayoutA = typename ImplicitGemmKernel::LayoutA; + using ElementB = typename ImplicitGemmKernel::ElementB; + using LayoutB = typename ImplicitGemmKernel::LayoutB; + using ElementC = typename ImplicitGemmKernel::ElementC; + using LayoutC = typename ImplicitGemmKernel::LayoutC; + using ElementAccumulator = typename ImplicitGemmKernel::ElementAccumulator; + using ElementCompute = typename ImplicitGemmKernel::ElementCompute; + using OperatorClass = typename ImplicitGemmKernel::OperatorClass; + using ArchTag = typename ImplicitGemmKernel::ArchTag; + using ThreadblockShape = typename ImplicitGemmKernel::ThreadblockShape; + using WarpShape = typename ImplicitGemmKernel::WarpShape; + using InstructionShape = typename ImplicitGemmKernel::InstructionShape; + using ThreadblockSwizzle = typename ImplicitGemmKernel::ThreadblockSwizzle; + using EpilogueOutputOp = typename ImplicitGemmKernel::EpilogueOutputOp; + static int const kStages = ImplicitGemmKernel::kStages; + static int const kConvDim = ImplicitGemmKernel::kConvDim; + using WarpMmaOperator = typename ImplicitGemmKernel::WarpMmaOperator; + using ArchMmaOperator = typename ImplicitGemmKernel::ArchMmaOperator; + using MathOperator = typename ImplicitGemmKernel::MathOperator; + + static cutlass::conv::Operator const kConvolutionalOperator = ImplicitGemmKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = ImplicitGemmKernel::kIteratorAlgorithm; + + static int const kWarpCount = + (ThreadblockShape::kM / WarpShape::kM) * + (ThreadblockShape::kN / WarpShape::kN); + + /// Argument structure + using Arguments = typename ImplicitGemmKernel::Arguments; + +private: + + /// Kernel parameters object + typename ImplicitGemmKernel::Params params_; + +public: + + /// Constructs Implicit GEMM + ImplicitGemmConvolution() 
{ } + + /// Determines whether the Implicit GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + // dispatch to iterators + Status status = ImplicitGemmKernel::Mma::IteratorA::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + status = ImplicitGemmKernel::Mma::IteratorB::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape( + threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size), + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices)); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t workspace_bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size), + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + if(args.split_k_mode == SplitKMode::kParallel) { + + // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace. + // The user needs to call a reduction operator to optain the final output tensor + workspace_bytes = + sizeof(ElementAccumulator) * + size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size)) * + size_t(grid_tiled_shape.k()); + } + + else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size.split_k_slices > 1) { + + // Split-K serial: The user workspace is used to store semaphore and serialize writing the + // final reduced output to user's output tensor + workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n()); + } + + return workspace_bytes; + } + + /// Initializes GEMM state from arguments. + Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + if (args.problem_size.split_k_slices > 1) { + + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream); + + if (status != cudaSuccess) { + return Status::kErrorInternal; + } + } + + // initialize the params structure from the arguments + params_ = typename ImplicitGemmKernel::Params( + args, + static_cast(workspace) + ); + + int smem_size = int(sizeof(typename ImplicitGemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + cutlass::Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Initializes GEMM state from arguments. 
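+  /// (Lightweight variant of initialize(): refreshes only the tensor pointers, the epilogue
+  /// output op, and the semaphore/workspace pointer; the grid shape and iterator parameters
+  /// computed during initialize() are reused.)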
+ Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.ptr_A = args.ref_A.data(); + params_.ptr_B = args.ref_B.data(); + params_.ptr_C = args.ref_C.data(); + params_.ptr_D = args.ref_D.data(); + params_.output_op = args.output_op; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(32 * kWarpCount, 1, 1); + + int smem_size = int(sizeof(typename ImplicitGemmKernel::SharedStorage)); + + cutlass::Kernel<<>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_conv2d.h b/include/cutlass/conv/kernel/default_conv2d.h new file mode 100644 index 0000000000..57fae79655 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d.h @@ -0,0 +1,104 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief + Default kernel-level implicit GEMM convolution definitions for threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" +#include "cutlass/conv/threadblock/implicit_gemm_pipelined.h" +#include "cutlass/conv/threadblock/implicit_gemm_multistage.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename ArchTag, + typename Shape, + typename WarpMmaTensorOp, + int PartitionsK, + typename OutputOp +> +struct DefaultConvEpilogue { + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + 1, + OutputOp, + OutputOp::kCount + >::Epilogue; +}; + +template < + typename Shape, + typename WarpMmaTensorOp, + int PartitionsK, + typename OutputOp +> +struct DefaultConvEpilogue< + arch::Sm70, + Shape, + WarpMmaTensorOp, + PartitionsK, + OutputOp +> { + + using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOp< + Shape, + WarpMmaTensorOp, + 1, + OutputOp, + OutputOp::kCount + >::Epilogue; +}; + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/include/cutlass/conv/kernel/default_conv2d_dgrad.h new file mode 100644 index 0000000000..c590f57efc --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d_dgrad.h @@ -0,0 +1,1154 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dDgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv2dDgrad; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided and +// multistage pipeline. 
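+//
+// For reference, a hypothetical instantiation that would pick up the specialization below
+// (tile shapes, epilogue functor, and threadblock swizzle are illustrative assumptions, not
+// choices made by this header):
+//
+//   using DgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+//     cutlass::half_t, cutlass::layout::TensorNHWC,   // ElementA, LayoutA (output gradient)
+//     cutlass::half_t, cutlass::layout::TensorNHWC,   // ElementB, LayoutB (filter)
+//     cutlass::half_t, cutlass::layout::TensorNHWC,   // ElementC, LayoutC (activation gradient)
+//     float,                                          // ElementAccumulator
+//     cutlass::arch::OpClassTensorOp,
+//     cutlass::arch::Sm80,
+//     cutlass::gemm::GemmShape<128, 128, 32>,         // ThreadblockShape
+//     cutlass::gemm::GemmShape<64, 64, 32>,           // WarpShape
+//     cutlass::gemm::GemmShape<16, 8, 16>,            // InstructionShape
+//     cutlass::epilogue::thread::LinearCombination<cutlass::half_t, 8, float, float>,
+//     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+//     3,                                              // Stages (values other than 2 select the multistage mainloop)
+//     cutlass::arch::OpMultiplyAdd,
+//     cutlass::conv::IteratorAlgorithm::kAnalytic,
+//     cutlass::conv::StrideSupport::kStrided
+//   >::Kernel;
+//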
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided +// and 2 stage pipeline. 
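+// Note: the two-stage ImplicitGemmPipelined mainloop is the software-pipelined, double-buffered
+// path used on architectures that lack the asynchronous copy (cp.async) instructions exploited
+// by the ImplicitGemmMultistage specialization above.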
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Unity Strided +// and multistage pipeline. 
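+// Note: StrideSupport::kUnity restricts this specialization to unit convolution stride
+// (stride_h == stride_w == 1); dgrad problems with larger strides must use the kStrided
+// variants above.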
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Unity +// 2 stage pipeline. 
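+// Note: in the two-stage path the analytic tile-access iterators are wrapped in
+// conv::threadblock::TileIterator<>, which adapts them to the fragment-loading interface
+// used by ImplicitGemmPipelined; the multistage specializations consume the access
+// iterators directly.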
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for optimized IteratorAlgorithm Dgrad Unity Strided +// and multistage pipeline. 
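+// Note: per the IteratorAlgorithm definitions in convolution.h, kOptimized is intended for
+// filters with R <= 32 and S <= 32 and for unity-stride dgrad, while kAnalytic remains
+// functionally correct in all cases at lower performance.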
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm Dgrad Unity +// 2 stage pipeline +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, 
layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename 
MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + 
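+//
+// A hypothetical end-to-end sketch of running one of the kernels defined above through the
+// device-level wrapper (tensor names, tile parameters, and the Arguments field order shown
+// here are illustrative assumptions, not definitions made by this header):
+//
+//   using DgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< /* ... */ >::Kernel;
+//   using Dgrad       = cutlass::conv::device::ImplicitGemmConvolution<DgradKernel>;
+//
+//   Dgrad dgrad_op;
+//   typename Dgrad::Arguments args(
+//     problem_size,             // cutlass::conv::Conv2dProblemSize
+//     tensor_dy.device_ref(),   // A: output gradient (NHWC)
+//     tensor_w.device_ref(),    // B: filter (KRSC)
+//     tensor_dx.device_ref(),   // C: source activation gradient
+//     tensor_dx.device_ref(),   // D: destination activation gradient
+//     {alpha, beta});           // epilogue parameters
+//
+//   size_t workspace_bytes = Dgrad::get_workspace_size(args);
+//   // ... allocate `workspace_bytes` of device memory at workspace_ptr ...
+//   cutlass::Status status = dgrad_op.initialize(args, workspace_ptr);
+//   if (status == cutlass::Status::kSuccess) {
+//     status = dgrad_op();   // launches the implicit GEMM dgrad kernel
+//   }
+//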
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + 
MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop.h b/include/cutlass/conv/kernel/default_conv2d_fprop.h new file mode 100644 index 0000000000..c38d5150b1 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d_fprop.h @@ -0,0 +1,1379 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv2dFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage +/// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage +/// pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm +/// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and +/// multistage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and +// multistage pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + layout::TensorNCxHWx, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + layout::TensorCxRSKx, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm +/// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + 
IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + 
cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + 
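The SIMT fprop specializations follow the same recipe as the Tensor Core ones, differing mainly in the operator class passed to MmaCore, the warp-level operator (MmaWarpSimt instead of MmaTensorOp), and the SIMT epilogue. A minimal sketch of selecting the two-stage SIMT path just defined is shown below; as before, the concrete element types, tile shapes, epilogue, and swizzle are illustrative assumptions rather than requirements of this header.

```cpp
// Illustrative only: the shapes and functors below are assumptions for the sake of
// the example, not values mandated by default_conv2d_fprop.h.
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"

// Compose the SM50 SIMT (FFMA) fprop kernel with the Analytic iterator algorithm and
// a two-stage pipelined mainloop, matching the specialization defined above.
using Conv2dFpropKernel = cutlass::conv::kernel::DefaultConv2dFprop<
  float, cutlass::layout::TensorNHWC,              // ElementA / LayoutA: activations
  float, cutlass::layout::TensorNHWC,              // ElementB / LayoutB: filters
  float, cutlass::layout::TensorNHWC,              // ElementC / LayoutC: output
  float,                                           // ElementAccumulator
  cutlass::arch::OpClassSimt,                      // FFMA mainloop
  cutlass::arch::Sm50,
  cutlass::gemm::GemmShape<128, 128, 8>,           // ThreadblockShape (illustrative)
  cutlass::gemm::GemmShape<32, 64, 8>,             // WarpShape (illustrative)
  cutlass::gemm::GemmShape<1, 1, 1>,               // InstructionShape for SIMT
  cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
  cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
  2,                                               // Stages: selects the two-stage pipelined path
  cutlass::arch::OpMultiplyAdd,
  cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
```

Because these templates are partially specialized on the literal stage count 2, requesting two stages routes the composition through ImplicitGemmPipelined, while larger stage counts (with an SM80 architecture tag) select the ImplicitGemmMultistage specializations defined earlier in the file.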
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/include/cutlass/conv/kernel/default_conv2d_wgrad.h new file mode 100644 index 0000000000..c7912203a4 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d_wgrad.h @@ -0,0 +1,928 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv2dWgrad; +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for 
Conv2dWgrad specialzation for Analytic IteratorAlgorithm and multistage +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm and two +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm and multistage +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm and two +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename 
cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // 
Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for 
Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/include/cutlass/conv/kernel/default_conv3d_dgrad.h new file mode 100644 index 0000000000..a92b4bfb6a --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv3d_dgrad.h @@ -0,0 +1,184 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dDgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv3dDgrad; + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided +// and multistage pipeline. 
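Before the strided, analytic `DefaultConv3dDgrad` specialization below, a brief orientation on how each operator maps convolution extents onto an implicit GEMM. The sketch writes the 2-D mapping out as plain host code; it mirrors what `cutlass::conv::implicit_gemm_problem_size()` computes, the struct and helper names are hypothetical, and for the 3-D kernels in this header the output depth `Z` and filter depth `T` multiply into the corresponding products.

// Hypothetical illustration of the implicit GEMM extents (2-D case, single group).
#include <cstdio>

struct GemmExtent { int m, n, k; };

GemmExtent implicit_gemm_extent_2d(
    char op,                       // 'f' = fprop, 'd' = dgrad, 'w' = wgrad
    int N, int H, int W, int C,    // activation extents (NHWC)
    int K, int R, int S,           // filter extents (KRSC)
    int P, int Q) {                // output extents (NPQK)
  switch (op) {
    case 'f': return { N * P * Q, K, R * S * C };   // Fprop: output pixels x filters
    case 'd': return { N * H * W, C, K * R * S };   // Dgrad: activation pixels x channels
    default : return { K, R * S * C, N * P * Q };   // Wgrad: reduces over the output volume
  }
}

int main() {
  GemmExtent e = implicit_gemm_extent_2d('d', 32, 56, 56, 64, 128, 3, 3, 56, 56);
  std::printf("dgrad implicit GEMM: M=%d N=%d K=%d\n", e.m, e.n, e.k);
  return 0;
}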
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop.h b/include/cutlass/conv/kernel/default_conv3d_fprop.h new file mode 100644 index 0000000000..7694c8b9e8 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv3d_fprop.h @@ -0,0 +1,181 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv3dFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage +// pipeline. 
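One more piece of orientation before the `DefaultConv3dFprop` specialization below: the mainloop trip count later stored as `gemm_k_iterations` is not simply `GEMM_K / ThreadblockShape::kK`; for fprop and dgrad the tile iterators visit every filter position and tile the channel (or filter-count) dimension within each position. The helper below is a hedged sketch of that trip count for the 2-D case with `split_k_slices == 1`; it mirrors the role of `cutlass::conv::implicit_gemm_k_iterations()` rather than reproducing it, and the 3-D variants multiply by the filter depth `T` as well.

// Hypothetical sketch of the threadblock mainloop trip count (2-D, split_k_slices == 1).
inline int ceil_div(int a, int b) { return (a + b - 1) / b; }

int implicit_gemm_k_iterations_2d(
    char op,            // 'f' = fprop, 'd' = dgrad, 'w' = wgrad
    int threadblock_k,  // ThreadblockShape::kK
    int N, int C, int K, int R, int S, int P, int Q) {
  switch (op) {
    case 'f': return R * S * ceil_div(C, threadblock_k);  // filter positions x C tiles
    case 'd': return R * S * ceil_div(K, threadblock_k);  // filter positions x K tiles
    default : return ceil_div(N * P * Q, threadblock_k);  // wgrad: tiles of the output volume
  }
}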
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/include/cutlass/conv/kernel/default_conv3d_wgrad.h new file mode 100644 index 0000000000..b0f5b91558 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv3d_wgrad.h @@ -0,0 +1,504 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv3dWgrad; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialzation for Analytic IteratorAlgorithm and multistage +// pipeline. 
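Before the analytic, multi-stage `DefaultConv3dWgrad` specialization below, note that wgrad reduces over the entire output volume: GEMM_K equals `N * Z * P * Q`, usually far larger than GEMM_M (`K`) and GEMM_N (`T * R * S * C`), which is why serial or parallel split-K reduction is the usual way to expose enough parallelism. The helper below is a hypothetical sizing heuristic for `split_k_slices`, included only to make that trade-off concrete; it is not a CUTLASS routine.

// Hypothetical heuristic: split the wgrad reduction so that each slice runs roughly
// `target_iterations_per_slice` mainloop iterations.
inline int ceil_div(int a, int b) { return (a + b - 1) / b; }

int suggest_split_k_slices(int gemm_k, int threadblock_k, int target_iterations_per_slice) {
  int total_iterations = ceil_div(gemm_k, threadblock_k);
  int slices = ceil_div(total_iterations, target_iterations_per_slice);
  return slices < 1 ? 1 : slices;
}

// Example: N = 32, Z = P = Q = 28 gives gemm_k = 32 * 28 * 28 * 28 = 702464; with
// ThreadblockShape::kK = 32 and a target of 256 iterations per slice this suggests 86 slices.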
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dWgrad specialzation for Analytic IteratorAlgorithm and two +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialzation for Optimized IteratorAlgorithm and multistage +// pipeline. 
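Before the optimized, multi-stage `DefaultConv3dWgrad` specialization below: relative to `kAnalytic`, the `kOptimized` iterator algorithm moves work to the host-side `Params` (precomputed pointer increments) and packs one guard bit per access into a small integer mask instead of re-evaluating tensor bounds on every load; `Conv2dDgradFilterTileAccessIteratorOptimized` further down in this diff shows the pattern. The standalone sketch below illustrates that predicate-packing idiom with hypothetical names; it is not the library's iterator.

// Hypothetical illustration of packing per-access guard bits into a 32-bit mask.
#include <cstdint>

struct PackedPredicates {
  uint32_t bits = 0;

  // Record whether access (contiguous index c, strided index s) is in bounds.
  void set(int c, int s, int accesses_per_row, bool valid) {
    bits |= (uint32_t(valid ? 1u : 0u) << (c + s * accesses_per_row));
  }
  bool test(int c, int s, int accesses_per_row) const {
    return (bits >> (c + s * accesses_per_row)) & 1u;
  }
  // Invalidate every access in one strided row, e.g. once the filter K index runs past K.
  void clear_row(int s, int accesses_per_row) {
    uint32_t mask = ((1u << accesses_per_row) - 1) << (s * accesses_per_row);
    bits &= ~mask;
  }
};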
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dWgrad specialzation for Optimized IteratorAlgorithm and two +// pipeline. 
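Finally, the two-stage `DefaultConv3dWgrad` specialization that follows pairs `ImplicitGemmPipelined` with the `ArchTag`-dispatched `detail::DefaultConvEpilogue`, so it also serves pre-Ampere targets, whereas the multi-stage specializations above assume the SM80 `cp.async` mainloop. CUTLASS takes the stage count as an explicit template parameter; the trait below is only a hypothetical sketch of how a wrapper might choose a default per architecture.

// Hypothetical default-stage-count trait; not part of CUTLASS.
#include "cutlass/arch/arch.h"

template <typename ArchTag>
struct DefaultStages {
  static int const kStages = 2;   // pre-SM80: double-buffered ImplicitGemmPipelined
};

template <>
struct DefaultStages<cutlass::arch::Sm80> {
  static int const kStages = 3;   // SM80: cp.async-based ImplicitGemmMultistage
};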
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h new file mode 100644 index 0000000000..2ec1566889 --- /dev/null +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h @@ -0,0 +1,424 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined Implicit GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize ///! 
Convolutional operator on 2D or 3D problem +> +struct ImplicitGemmConvolution { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename EpilogueOutputOp::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using WarpMmaOperator = typename Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename WarpMmaOperator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + static int const kStages = Mma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using TensorRefA = typename Mma::IteratorA::TensorRef; + using TensorRefB = typename Mma::IteratorB::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::ImplicitGemmConvolution::kConvDim + static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, + "Convolution on different different dimensions is not supported"); + static int const kConvDim = Mma::IteratorA::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + /// Wgrad C stride idx for implicit gemm algorithm + // Conv2d row-major matrix C (KxRSC) + // Conv3d row-major matrix C (KxTRSC) + static int const kWgradCStrideIdx = + cutlass::platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorCStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? 
kWgradCStrideIdx : 0); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size; + TensorRefA ref_A; + TensorRefB ref_B; + TensorRefC ref_C; + TensorRefC ref_D; + typename EpilogueOutputOp::Params output_op; + SplitKMode split_k_mode; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefC const & ref_C, + TensorRefC const & ref_D, + typename EpilogueOutputOp::Params const & output_op, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size(problem_size), + ref_A(ref_A), + ref_B(ref_B), + ref_C(ref_C), + ref_D(ref_D), + output_op(output_op), + split_k_mode(split_k_mode) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size; + int gemm_k_iterations; + typename Mma::IteratorA::Params iterator_A; + typename Mma::IteratorA::Element const *ptr_A; + typename Mma::IteratorB::Params iterator_B; + typename Mma::IteratorB::Element const *ptr_B; + typename Epilogue::OutputTileIterator::Params iterator_C; + typename Epilogue::OutputTileIterator::Element *ptr_C; + typename Epilogue::OutputTileIterator::Params iterator_D; + typename Epilogue::OutputTileIterator::Element *ptr_D; + typename EpilogueOutputOp::Params output_op; + int *semaphore; + SplitKMode split_k_mode; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): gemm_k_iterations(0) { } + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size(args.problem_size), + implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), + grid_tiled_shape(grid_tiled_shape), + iterator_A(args.problem_size, args.ref_A.layout()), + ptr_A(args.ref_A.data()), + iterator_B(args.problem_size, args.ref_B.layout()), + ptr_B(args.ref_B.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)), + ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)), + ptr_D(args.ref_D.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode) + { + gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + implicit_gemm_problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + ImplicitGemmConvolution() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + 
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.k() * Mma::Shape::kK + ) + ); + + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_B, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * Mma::Shape::kK, + threadblock_tile_idx.n() * Mma::Shape::kN + ) + ); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Construct the semaphore. + int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. 
+ if (threadblock_tile_idx.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_idx.k()); + + __threadfence(); + } + // Each split-k-slice writes to a unique tensor location + else if (params.split_k_mode == SplitKMode::kParallel) { + iterator_D.add_pointer_offset(threadblock_tile_idx.k() * + cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size)); + } + + // Run efficient epilogue + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..14c8a4e829 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h @@ -0,0 +1,240 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dDgradFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or larger."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // For a fixed filter position (r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension + int filter_r_; + int filter_s_; + int offset_k_[ThreadMap::Iterations::kStrided]; + int offset_c_[ThreadMap::Iterations::kContiguous]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = + threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } 
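+
+  // Traversal order: advance() visits filter positions with `s` varying fastest, then `r`;
+  // once both wrap, it moves every strided offset forward by Shape::kRow * split_k_slices,
+  // so each split-K slice covers a disjoint set of K tiles. The offset_k_ / offset_c_ arrays
+  // hold this thread's strided (K) and contiguous (C) coordinates set in the constructor.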
+ + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the filter tensor w that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int c = offset_c_[iteration_contiguous_]; + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_r_, filter_s_, c); + } + + /// Returns true if the current coordinate is within the filter tensor w + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..f76dcde931 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h @@ -0,0 +1,283 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv2dDgradFilterTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = StrideSupport_; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // + // Parameters structure + // + + struct Params : Conv2dDgradFilterIteratorOptimizedParams { + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dDgradFilterIteratorOptimizedParams const &base): + Conv2dDgradFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dDgradFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + + }; + 
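// Unlike the analytic iterator, which recomputes bounds from coordinates on
// every access, this class packs one validity bit per (contiguous, strided)
// access into the 32-bit predicates_ word declared below; the bits are built
// once in the constructor, and advance() masks off a whole strided slice when
// filter_k_ runs past K, which is why the access count must fit in 32 bits.
// The standalone sketch below mirrors that packing with toy extents and
// made-up per-thread offsets.

#include <cstdint>
#include <cstdio>

int main() {
  // Toy stand-ins for ThreadMap::Iterations and ThreadMap::Delta (assumed values).
  int const kContiguous = 2, kStrided = 4;
  int const delta_k = 2, delta_c = 4;
  int const K = 6, C = 7;                    // toy problem extents
  int const k0 = 1, c0 = 3;                  // assumed per-thread starting offsets

  // Pack one validity bit per access, exactly once, as the constructor does.
  uint32_t predicates = 0;
  for (int s = 0; s < kStrided; ++s) {
    for (int c = 0; c < kContiguous; ++c) {
      bool in_bounds = (k0 + s * delta_k) < K && (c0 + c * delta_c) < C;
      predicates |= uint32_t(in_bounds) << (c + s * kContiguous);
    }
  }

  // When a strided slice falls past K, all of its bits are cleared with one
  // mask instead of re-testing every access, as advance() does.
  int const s_out = 2;                       // hypothetical out-of-range slice
  uint32_t const clear_mask = ((1u << kContiguous) - 1) << (s_out * kContiguous);
  predicates &= ~clear_mask;

  std::printf("packed predicates = 0x%02x\n", predicates);
  return 0;
}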
+private: + + Conv2dDgradFilterIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_rs_; + int filter_k_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided * + ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorOptimized( + Conv2dDgradFilterIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_rs_(0), + filter_k_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.strided(); + Index column = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided; + int filter_c = column + c * ThreadMap::Delta::kContiguous; + + uint32_t pred = ((filter_k < problem_size_.K && filter_c < problem_size_.C) ? 1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + pointer_ += ( + filter_k_ * params.layout.stride()[2] + column + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_rs; + + // moves to the next tile + ++filter_rs_; + if (filter_rs_ == params_.RS) { + + filter_rs_ = 0; + next = params_.inc_next_k; + filter_k_ += params_.filter_k_delta; + } + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + 
Conv2dDgradFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_strided; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..d32da7c3bf --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,525 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradOutputGradientTileAccessIteratorAnalytic strided dgrad needs special handling using +// unscaled coordinations +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_w_[ThreadMap::Iterations::kStrided]; + int offset_h_[ThreadMap::Iterations::kStrided]; + +private: + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator but DOES NOT scale by the convolution stride. This is needed + /// to compute predicates in the valid() method. The return value of the public at() + /// method is correctly scaled. 
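// Concretely: the unscaled value h + pad_h - r * dilation_h names a real row
// of Dy only when it is divisible by stride_h, so valid() tests divisibility
// on the unscaled coordinate while at() performs the division. The standalone
// sketch below walks one input row of a made-up strided problem and shows
// which filter rows actually contribute; all sizes are illustrative only.

#include <cstdio>

int main() {
  // Hypothetical strided-dgrad problem: stride 2, pad 1, dilation 1, 3x3 filter.
  int const stride_h = 2, pad_h = 1, dilation_h = 1, R = 3, P = 4;

  int const h = 3;                                   // one input row of Dx
  for (int r = 0; r < R; ++r) {
    int unscaled = h + pad_h - r * dilation_h;       // what unscaled_at_() yields
    bool divisible = (unscaled % stride_h) == 0;     // the divisibility test in valid()
    int p = unscaled / stride_h;                     // the scaled row that at() yields
    bool contributes = divisible && p >= 0 && p < P;
    std::printf("r=%d  unscaled=%d  p=%d  contributes=%d\n",
                r, unscaled, p, int(contributes));
  }
  return 0;
}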
+ CUTLASS_HOST_DEVICE + TensorCoord unscaled_at_() const { + int n = offset_n_[iteration_strided_]; + int h = offset_h_[iteration_strided_]; + int w = offset_w_[iteration_strided_]; + + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h); + int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w); + + return TensorCoord(n, p, q, filter_k_); + } + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_nhw / (problem_size_.H * problem_size_.W); + int residual = offset_nhw % (problem_size_.H * problem_size_.W); + + offset_h_[s] = residual / problem_size_.W; + offset_w_[s] = residual % problem_size_.W; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // move to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + TensorCoord coord = unscaled_at_(); + + return TensorCoord( + coord.n(), + coord.h() / problem_size_.stride_h, + coord.w() / problem_size_.stride_w, + coord.c()); + } + + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord unscaled_coord = unscaled_at_(); + TensorCoord coord = at(); + + return + !(unscaled_coord.h() % problem_size_.stride_h) && !(unscaled_coord.w() % problem_size_.stride_w) && + coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradOutputGradientTileAccessIteratorAnalytic for unity strides can be optimized by +// eliminating modulo arithmetic to compute unscaled coordinates +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kUnity +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const 
*pointer_; + + int filter_k_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_w_[ThreadMap::Iterations::kStrided]; + int offset_h_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_nhw / (problem_size_.H * problem_size_.W); + int residual = offset_nhw % (problem_size_.H * problem_size_.W); + + offset_h_[s] = residual / problem_size_.W; + offset_w_[s] = residual % problem_size_.W; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // move to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int n = offset_n_[iteration_strided_]; + int h = offset_h_[iteration_strided_]; + int w = offset_w_[iteration_strided_]; + + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h) / problem_size_.stride_h; + int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w) / problem_size_.stride_w; + + return TensorCoord(n, p, q, filter_k_); + + } + + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // Conv2dDgradFilterTileAccessIteratorAnalytic unity stride specialization + // only supports (stride_h, stride_w) = (1, 1) + if (problem_size.stride() != MatrixCoord({1, 1})) { + return Status::kErrorNotSupported; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..71299cf578 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,437 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv2dDgradOutputGradientTileAccessIteratorOptimized { +public: + + static_assert(StrideSupport_ == conv::StrideSupport::kUnity, + "Only unit-stride dgrad is supported at this time."); + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require 
Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv2dDgradOutputGradientIteratorOptimizedParams { + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dDgradOutputGradientIteratorOptimizedParams const &base): + Conv2dDgradOutputGradientIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dDgradOutputGradientIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + }; + +private: + + Conv2dDgradOutputGradientIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (r, s) + int filter_r_; + int filter_s_; + int filter_k_; + + Index masks_[ThreadMap::Iterations::kStrided][2]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorOptimized( + Conv2dDgradOutputGradientIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + filter_k_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_h[ThreadMap::Iterations::kStrided]; + int offset_w[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_nhw / (problem_size_.H * problem_size_.W); + // int residual = offset_nhw % (problem_size_.H * problem_size_.W); + // + // offset_h[s] = residual / problem_size_.W; + // offset_w[s] = residual % problem_size_.W; + // + + int residual; + + params_.hw_divmod(offset_n[s], residual, offset_nhw); + params_.w_divmod(offset_h[s], offset_w[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_h[s], offset_w[s], 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h; + + bool pred = (offset_n[s_idx] < problem_size_.N && p >= 0 && p < problem_size_.P); + masks_[s_idx][0] |= (pred << r); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == 
Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w; + + bool pred = (q >= 0 && q < problem_size_.Q); + masks_[s_idx][1] |= (pred << s); + } + } + + if (filter_k_ >= problem_size.K) { + clear_mask(); + } + + set_iteration_index(0); + } + +private: + + /// Returns the coordinate in the output gradient tensor dy that is correspoinding to + // output nhw and filter position k, r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int h, int w, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int p = h + problem_size_.pad_h - r * problem_size_.dilation_h; + int q = w + problem_size_.pad_w - s * problem_size_.dilation_w; + + return TensorCoord(n, p, q, filter_k_); + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. + #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + filter_s_ = 0; + ++filter_r_; + + if (filter_r_ < problem_size_.R) { + next_idx = 1; + } + else { + filter_r_ = 0; + next_idx = 2; + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 2) { + filter_k_ += params_.filter_k_delta; + } + + clear_mask_(filter_k_ >= problem_size_.K); + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return 
reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // This is specialized for unit stride + if (problem_size.stride() != MatrixCoord({1, 1})) { + return Status::kErrorNotSupported; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorNotSupported; + } + + // Limit on filter size + if (problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..92dd705d6b --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h @@ -0,0 +1,274 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorNCxHWx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv2dFpropActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_c_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_p_[ThreadMap::Iterations::kStrided]; + int offset_q_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_c_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q); + int residual = offset_npq % (problem_size_.P * problem_size_.Q); + + offset_p_[s] = residual / problem_size_.Q; + offset_q_[s] = residual % problem_size_.Q; + } + + 
set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_c_ += Shape::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + int n = offset_n_[iteration_strided_]; + int p = offset_p_[iteration_strided_]; + int q = offset_q_[iteration_strided_]; + + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - filter_r_); + s = (problem_size_.S - 1 - filter_s_); + } + + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, h, w, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
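// Before can_implement() is defined, note what at() and valid() above compute:
// an output position (n, p, q) combined with a filter offset (r, s) reads the
// activation element at h = p * stride_h - pad_h + r * dilation_h (and the
// analogous w), and any coordinate that lands outside the tensor is simply
// predicated off, which is how zero padding is realized without materializing
// an im2col buffer. The standalone sketch below prints that mapping for one
// output position of a made-up problem; the sizes are illustrative only.

#include <cstdio>

int main() {
  // Hypothetical fprop problem: 5x5 input, 3x3 filter, stride 1, pad 1, dilation 1.
  int const H = 5, W = 5, R = 3, S = 3;
  int const stride_h = 1, stride_w = 1, pad_h = 1, pad_w = 1, dil_h = 1, dil_w = 1;

  int const p = 0, q = 0;                                   // one output position
  for (int r = 0; r < R; ++r) {
    for (int s = 0; s < S; ++s) {
      int h = p * stride_h - pad_h + r * dil_h;             // same mapping as at()
      int w = q * stride_w - pad_w + s * dil_w;
      bool in_bounds = h >= 0 && h < H && w >= 0 && w < W;  // same test as valid()
      std::printf("(r=%d,s=%d) -> (h=%d,w=%d) %s\n",
                  r, s, h, w, in_bounds ? "load" : "zero (padding)");
    }
  }
  return 0;
}

// can_implement(), defined next, additionally requires the contiguous C
// dimension to fill whole 128-bit accesses, for example a multiple of 8
// half-precision or 4 single-precision elements, with stricter multiples for
// the interleaved NCxHWx layouts.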
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.C % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.C % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..afb015d352 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h @@ -0,0 +1,438 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorNCxHWx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv2dFpropActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv2dFpropActivationIteratorOptimizedParams { + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dFpropActivationIteratorOptimizedParams const &base): + Conv2dFpropActivationIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dFpropActivationIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { + + } + }; + +private: + + Conv2dFpropActivationIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (r, s) + int filter_r_; + int filter_s_; + int filter_c_; + + Index masks_[ThreadMap::Iterations::kStrided][2]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorOptimized( + Conv2dFpropActivationIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + filter_c_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_p[ThreadMap::Iterations::kStrided]; + 
int offset_q[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_npq / (problem_size_.P * problem_size_.Q); + // int residual = offset_npq % (problem_size_.P * problem_size_.Q); + // + // offset_p[s] = residual / problem_size_.Q; + // offset_q[s] = residual % problem_size_.Q; + // + + int residual; + + params.pq_divmod(offset_n[s], residual, offset_npq); + params.q_divmod(offset_p[s], offset_q[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_p[s], offset_q[s], 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h; + + bool pred = (offset_n[s_idx] < problem_size_.N && h >= 0 && h < problem_size_.H); + masks_[s_idx][0] |= (pred << r); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w; + + bool pred = (w >= 0 && w < problem_size_.W); + masks_[s_idx][1] |= (pred << s); + } + } + + if (filter_c_ >= problem_size.C) { + clear_mask(); + } + + set_iteration_index(0); + } + +private: + + /// Returns the coordinate in the activations tensor X that is correspoinding to + // output npq and filter position r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int p, int q, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, h, w, filter_c_); + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. 
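+      // Both paths implement the same predicated assignment,
+      //   masks_[s][i] = clear ? 0 : masks_[s][i];
+      // the PTX form forces a predicated mov so no branch is emitted, while
+      // the C++ fallback below leaves that decision to the compiler.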
+ #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + filter_s_ = 0; + ++filter_r_; + + if (filter_r_ < problem_size_.R) { + next_idx = 1; + } + else { + filter_r_ = 0; + next_idx = 2; + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 2) { + filter_c_ += params_.filter_c_delta; + } + + clear_mask_(filter_c_ >= problem_size_.C); + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.C % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.C % 64) { + return Status::kErrorInvalidProblem; + } + } + + // Conv2dFpropActivationTileAccessIteratorOptimized has constraint on filter positions + // due to the number of mask bits. 
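+    // Each masks_ word stores one predicate bit per filter row r (masks_[s][0])
+    // or per filter column s (masks_[s][1]), so the 32-bit mask words cap R and S at 32.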
+ if (problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..6547e9c5ba --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h @@ -0,0 +1,252 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv2dFpropFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_r_; + int filter_s_; + int filter_c_; + + int offset_k_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the filter tensor W 
that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_r_, filter_s_, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.K % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.K % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..bf0d1d3124 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv2dFpropFilterTileAccessIteratorOptimized{ +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv2dFpropFilterIteratorOptimizedParams { + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dFpropFilterIteratorOptimizedParams const &base): + Conv2dFpropFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dFpropFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { + + } + }; + +private: + + Conv2dFpropFilterIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex 
iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_rs_; + int filter_c_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorOptimized( + Conv2dFpropFilterIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_rs_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + Index column = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < problem_size_.K) ? 1u : 0); + predicates_ |= (pred << s); + } + + if (filter_c_ >= problem_size.C) { + predicates_ = 0u; + } + + pointer_ += ( + params_.layout({filter_c_, column}) + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_rs; + + // moves to the next tile + ++filter_rs_; + if (filter_rs_ == params_.RS) { + + filter_rs_ = 0; + next = params_.inc_next_c; + filter_c_ += params_.filter_c_delta; + } + + if (filter_c_ >= problem_size_.C) { + predicates_ = 0; + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + return (predicates_ & (1u << iteration_strided_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_k; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
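The `can_implement()` check that follows enforces 128-bit aligned vector accesses along the iterator's contiguous dimension, plus extra divisibility (by 32 or 64) for the interleaved `TensorCxRSKx` layouts. A small sketch of the arithmetic behind the first check (the element widths below are illustrative):

```cpp
#include <cstdio>

int main() {
  struct Elem { char const *name; int bits; };
  Elem const elems[] = {
    {"float / tf32", 32}, {"half_t / bfloat16_t", 16}, {"int8_t", 8}, {"int4b_t", 4},
  };

  for (Elem const &e : elems) {
    int vector_elems = 128 / e.bits;   // elements per 128-bit global access
    printf("%-20s -> contiguous dimension must be a multiple of %d\n",
           e.name, vector_elems);
  }
  return 0;
}
```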
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.K % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.K % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv2d_params.h b/include/cutlass/conv/threadblock/conv2d_params.h new file mode 100644 index 0000000000..ac6b2e3095 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_params.h @@ -0,0 +1,609 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Extracts the host-params objects into non-template code. 
+*/ + +#pragma once + +#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0 + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Params structure used for all Conv2d analytic tile iterators +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dAnalyticParams { + + using Layout = Layout_; + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dAnalyticParams() { } + + CUTLASS_HOST_DEVICE + Conv2dAnalyticParams( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED + +CUTLASS_HOST_DEVICE +void TraceIteratorParams( + char const *conv_operator, + char const *operand, + int element_size_bits, + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta +) { + +#if !defined(__CUDA_ARCH__) + + char const *fname = "conv_iterator_params.csv"; + + std::ifstream test(fname); + bool file_exists = test.is_open(); + + if (file_exists) { + test.close(); + } + + std::ofstream trace("conv_iterator_params.csv", std::ofstream::app); + + if (!file_exists) { + trace + << "Operator,Operand,ElementSize,CtaRows,CtaColumns,ThreadCount,AccessSize," + << "IterationsContiguous,IterationsStrided,DeltaContiguous,DeltaStrided\n"; + } + + trace << conv_operator << "," << operand << "," << element_size_bits << "," + << threadblock_shape.row() << "," << threadblock_shape.column() + << "," << thread_count << "," << access_size + << "," << threadmap_iterations.contiguous() << "," << threadmap_iterations.strided() + << "," << threadmap_delta.contiguous() << "," << threadmap_delta.strided() << "\n"; +#endif +} + +#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) \ + TraceIteratorParams(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta); + +#else + +#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) {} + +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dFpropActivationIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template<> +struct Conv2dFpropActivationIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int PQ; // product of P*Q + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + 
// Methods + // + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1); + + // next S + inc_next[0] = conv_sign * (int64_t(layout.stride()[0]) * problem_size.dilation_w) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template +struct Conv2dFpropActivationIteratorOptimizedParams> { + static int const kInterleaved = Interleaved_; + + using Layout = layout::TensorNCxHWx; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int PQ; // product of P*Q + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 
-1 : 1); + + // next S + inc_next[0] = conv_sign * (kInterleaved * problem_size.dilation_w) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_h + - (problem_size.S - 1) * kInterleaved * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[1]) + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[0] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * kInterleaved * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dFpropFilterIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Conv2dFpropFilterIteratorOptimizedParams +{ + + using Layout = layout::TensorNHWC; + + Layout layout; + int RS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + RS = problem_size.R * problem_size.S; + + inc_next_k = (int64_t(layout.stride()[2]) * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( int64_t(layout.stride()[0]) + - int64_t(layout.stride()[2]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices + - int64_t(RS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +template +struct Conv2dFpropFilterIteratorOptimizedParams> +{ + static int const kInterleaved = Interleaved_; + using Layout = layout::TensorCxRSKx; + + Layout layout; + int RS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int 
access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + RS = problem_size.R * problem_size.S; + + inc_next_k = (kInterleaved * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( int64_t(layout.stride()[0]) + - kInterleaved * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[2]) + - int64_t(RS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * kInterleaved + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +/// Parameters object for Conv2d DGRAD OutputGradient (dy) iterator +struct Conv2dDgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next K} + + int filter_k_delta; // number of logical elements to add to filter_k_ + + int HW; // product of H*W + + FastDivmod hw_divmod; + FastDivmod w_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), HW(problem_size.H *problem_size.W), hw_divmod(HW), w_divmod(problem_size.W) { + + TRACE_CONV_INITIALIZERS("conv2d_dgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 
1 : -1); + + // next S + inc_next[0] = conv_sign * (layout.stride()[0] * problem_size.dilation_w) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next K + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +/// Parameters object for Conv2d DGRAD Filter (w) iterator +struct Conv2dDgradFilterIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + int RS; + int filter_k_delta; + + int64_t inc_next_strided; // offset in units of bytes to next K coordinate within tile + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_k; // offset in units of bytes to next K position in subsequent tile + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dDgradFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), RS(problem_size.R * problem_size.S) { + + TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + inc_next_strided = (layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + inc_next_k = + ( + threadblock_shape.row() * problem_size.split_k_slices * layout.stride()[2] + - (problem_size.R * problem_size.S - 1) * layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv2d WGRAD Output Gradient (dy) iterator +struct Conv2dWgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int NPQ; // precomputd product of N*P*Q for clearing predicates + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + int64_t offset_next_strided; // offset in units of bytes to next npq coordinate within tile + int64_t offset_next_contiguous; // offset in units of bytes to next k coordinate within tile + int64_t inc_next_npq; // offset in units of bytes to next npq position in subsequent tile + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + 
int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + NPQ(problem_size.N * problem_size.P * problem_size.Q), + pq_divmod(problem_size.P * problem_size.Q), + q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_wgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + // Incremental offsets in unites of bytes (number of elements) * sizeof_bits::value / 8 + offset_next_strided = (threadmap_delta.strided() * layout.stride()[0]) + * element_size_bits / 8; + + offset_next_contiguous = (threadmap_delta.contiguous()) + * element_size_bits / 8; + + inc_next_npq = (threadblock_shape.column() * problem_size.split_k_slices * layout.stride()[0]) + * element_size_bits / 8; + } +}; + +struct Conv2dWgradActivationIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + FastDivmod sc_divmod; + FastDivmod pq_divmod; + FastDivmod q_divmod; + FastDivmod c_divmod; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + layout(layout), + sc_divmod(problem_size.S * problem_size.C), + pq_divmod(problem_size.P * problem_size.Q), + q_divmod(problem_size.Q), + c_divmod(problem_size.C) { + + } + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + Conv2dWgradActivationIteratorOptimizedParams( + problem_size, + layout + ) { + + TRACE_CONV_INITIALIZERS("conv2d_wgrad", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h new file mode 100644 index 0000000000..ce52017e37 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h @@ -0,0 +1,170 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template wraps the tile access iterator concept to load whole tiles from tensors in + memory used for implicit GEMM convolution. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class TileIterator { +public: + using TileAccessIterator = TileAccessIterator_; + + using Shape = typename TileAccessIterator::Shape; + using Element = typename TileAccessIterator::Element; + using Layout = typename TileAccessIterator::Layout; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = typename TileAccessIterator::ThreadMap; + using AccessType = typename TileAccessIterator::AccessType; + using TensorRef = typename TileAccessIterator::TensorRef; + using Index = typename TileAccessIterator::Index; + using LongIndex = typename TileAccessIterator::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = TileAccessIterator::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = TileAccessIterator::kStrideSupport; + using Params = typename TileAccessIterator::Params; + static int const kConvDim = TileAccessIterator::kConvDim; + using ConvProblemSize = typename TileAccessIterator::ConvProblemSize; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + +private: + + /// Internal state + TileAccessIterator tile_access_iterator_; + +public: + + /// Constructor + CUTLASS_HOST_DEVICE + TileIterator( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + tile_access_iterator_(params, problem_size, ptr, thread_idx, threadblock_offset) { } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + tile_access_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. 
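In the load path further below, the wrapper walks the access iterator once per (contiguous, strided) iteration and drops each vector into the flat `Fragment`, with the contiguous index advancing fastest. A tiny sketch of that ordering, assuming a hypothetical 2x4 thread map:

```cpp
#include <cstdio>

int main() {
  // Hypothetical ThreadMap::Iterations: 2 contiguous x 4 strided accesses per thread.
  int const kContiguous = 2, kStrided = 4;

  for (int s = 0; s < kStrided; ++s) {
    for (int c = 0; c < kContiguous; ++c) {
      // Mirrors frag_ptr[c + s * ThreadMap::Iterations::kContiguous] in load_with_pointer_offset().
      printf("access (c=%d, s=%d) -> fragment slot %d\n", c, s, c + s * kContiguous);
    }
  }
  return 0;
}
```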
+ CUTLASS_HOST_DEVICE + TileIterator &operator++() { + tile_access_iterator_.advance(); + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + TileIterator operator++(int) { + TileIterator self(*this); + operator++(); + return self; + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + frag.clear(); + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + frag_ptr[c + s * ThreadMap::Iterations::kContiguous], + tile_access_iterator_.get() + pointer_offset, + tile_access_iterator_.valid() + ); + + ++tile_access_iterator_; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { + tile_access_iterator_.set_iteration_index(0); + load_with_pointer_offset(frag, 0); + } + + CUTLASS_DEVICE + void advance() { + tile_access_iterator_.advance(); + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // dispatch to iterator implementation + return TileAccessIterator::can_implement(problem_size); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..13d8338c2f --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h @@ -0,0 +1,254 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dWgradActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Filter postion (r,s,c) in contiguous dimension stays constant for each gemm_iteration_k + int filter_r_[ThreadMap::Iterations::kContiguous]; + int filter_s_[ThreadMap::Iterations::kContiguous]; + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) + { + + layout::PitchLinearCoord thread_coord = 
ThreadMap::initial_offset(thread_idx); + + // initialize r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int rsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C); + int residual = rsc_offset % (problem_size_.S * problem_size_.C); + + filter_s_[c] = residual / problem_size_.C; + filter_c_[c] = residual % problem_size_.C; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_npq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int r = filter_r_[iteration_contiguous_]; + int s = filter_s_[iteration_contiguous_]; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q); + int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
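`at()` above first unrolls the linear npq offset into (n, p, q) and then maps that output position, together with the filter position held by the contiguous slot, back to an input location. A worked host-side example with made-up convolution parameters (cross-correlation mode, so r and s are not mirrored):

```cpp
#include <cstdio>

int main() {
  // Hypothetical problem: output extent P x Q = 4 x 5, stride 2, padding 1, dilation 1.
  int const P = 4, Q = 5;
  int const stride_h = 2, stride_w = 2, pad_h = 1, pad_w = 1, dilation_h = 1, dilation_w = 1;

  int offset_npq = 37;                    // linear GEMM-K position
  int n = offset_npq / (P * Q);           // 37 / 20 = 1
  int residual = offset_npq % (P * Q);    // 17
  int p = residual / Q;                   // 3
  int q = residual % Q;                   // 2

  int r = 1, s = 0;                       // filter position owned by this contiguous slot
  int h = p * stride_h - pad_h + r * dilation_h;  // 3*2 - 1 + 1 = 6
  int w = q * stride_w - pad_w + s * dilation_w;  // 2*2 - 1 + 0 = 3

  printf("npq=%d -> (n,p,q)=(%d,%d,%d), activation (h,w)=(%d,%d)\n",
         offset_npq, n, p, q, h, w);
  return 0;
}
```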
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..74a887794b --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h @@ -0,0 +1,273 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dWgradActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dWgradActivationIteratorOptimizedParams; + +private: + + Conv2dWgradActivationIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Precomputed effective filter postion (r,s) in contiguous dimension stays constant for each gemm_iteration_k + // required for npq -> nhw translation + int precomputed_filter_r_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_s_[ThreadMap::Iterations::kContiguous]; + + // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorOptimized( + Conv2dWgradActivationIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) + { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int rsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C); + // int residual = rsc_offset % (problem_size_.S * problem_size_.C); + // + // filter_s_[c] = residual / problem_size_.C; + // filter_c_[c] = residual % problem_size_.C; + + int residual; + params_.sc_divmod(precomputed_filter_r_[c], residual, rsc_offset); + params_.c_divmod(precomputed_filter_s_[c], 
filter_c_[c], residual); + + int r = precomputed_filter_r_[c]; + int s = precomputed_filter_s_[c]; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + precomputed_filter_r_[c] = - problem_size_.pad_h + r * problem_size_.dilation_h; + precomputed_filter_s_[c] = - problem_size_.pad_w + s * problem_size_.dilation_w; + + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_npq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q); + // int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, p, q; + + params_.pq_divmod(n, residual, offset_npq_[iteration_strided_]); + params_.q_divmod(p, q, residual); + + int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_]; + int w = q * problem_size_.stride_w + precomputed_filter_s_[iteration_contiguous_]; + + return TensorCoord(n, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
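+  ///
+  /// The alignment test below corresponds to 128-bit vectorized global accesses along the
+  /// iterator's contiguous dimension. As an illustrative example (hypothetical element
+  /// types, not exhaustive): cutlass::half_t gives 128/16 = 8, float gives 128/32 = 4, and
+  /// int8_t gives 128/8 = 16 as the required divisor.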
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % (128/sizeof_bits<Element>::value)) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
new file mode 100644
index 0000000000..84c788d6d4
--- /dev/null
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
@@ -0,0 +1,234 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
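+
+    For the Wgrad output gradient iterator below, the contiguous dimension indexes the
+    K channels of Dy (the GEMM-M extent) and the strided dimension indexes the flattened
+    (N, P, Q) positions (the GEMM-K extent), which advance() steps by one CTA-K tile
+    scaled by split_k_slices.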
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dWgradOutputGradientTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize filter_k for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] = threadblock_offset.column() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + 
     offset_npq_[s] += Shape::kColumn * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int npq = offset_npq_[iteration_strided_];
+
+    int n = npq / (problem_size_.P * problem_size_.Q);
+    int residual = npq % (problem_size_.P * problem_size_.Q);
+
+    int p = residual / problem_size_.Q;
+    int q = residual % problem_size_.Q;
+
+    return TensorCoord(n, p, q, filter_k_[iteration_contiguous_]);
+  }
+
+
+  /// Returns true if the current coordinate is within the output gradient tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.N &&
+      coord.h() < problem_size_.P &&
+      coord.w() < problem_size_.Q &&
+      coord.c() < problem_size_.K;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradOutputGradientTileAccessIteratorAnalytic &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % (128/sizeof_bits<Element>::value)) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
new file mode 100644
index 0000000000..4a20cb1d8b
--- /dev/null
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
@@ -0,0 +1,300 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dWgradOutputGradientTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params : Conv2dWgradOutputGradientIteratorOptimizedParams { + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dWgradOutputGradientIteratorOptimizedParams const &base): + Conv2dWgradOutputGradientIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dWgradOutputGradientIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, 
ThreadMap::Delta::kStrided} + ) { } + }; + +private: + + Conv2dWgradOutputGradientIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_k_; + int offset_npq_; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorOptimized( + Conv2dWgradOutputGradientIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_k_(0), + offset_npq_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.contiguous(); + offset_npq_ = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous; + int offset_npq = offset_npq_ + s * ThreadMap::Delta::kStrided; + + bool predicate = valid_(at_(offset_npq, filter_k)); + + uint32_t pred = (predicate ? 1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) + pointer_ += ( + offset_npq_ * params.layout.stride()[0] + filter_k_ + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + offset_npq_ += Shape::kColumn * problem_size_.split_k_slices; + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (offset_npq_ + s * ThreadMap::Delta::kStrided >= params_.NPQ) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + + pointer_ += params_.inc_next_npq; + } + +private: + /// Returns the coordinate in the output gradient tensor Dy that is pointed to + /// by offset_npq and k. 
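+  ///
+  /// Illustrative decomposition with hypothetical sizes P = 14, Q = 14: offset_npq = 1000
+  /// yields n = 1000 / 196 = 5, residual = 20, p = 20 / 14 = 1, q = 20 % 14 = 6, i.e.
+  /// TensorCoord(5, 1, 6, k). The precomputed fast_divmod parameters below reproduce this
+  /// result without integer division in the inner loop.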
+ CUTLASS_HOST_DEVICE + TensorCoord at_(int offset_npq, int k) const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int npq = offset_npq; + // int n = npq / (problem_size_.P * problem_size_.Q); + // int residual = npq % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, p, q; + + params_.pq_divmod(n, residual, offset_npq); + params_.q_divmod(p, q, residual); + + return TensorCoord(n, p, q, k); + } + + /// Returns true if the coord is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid_(TensorCoord coord) const { + + return coord.n() < problem_size_.N && + coord.c() < problem_size_.K; + } + +public: + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast( + pointer_ + + iteration_strided_ * params_.offset_next_strided + + iteration_contiguous_ * params_.offset_next_contiguous + ); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..0033568278 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h @@ -0,0 +1,263 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dDgradFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or larger."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // For a fixed filter position (t,r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension + 
int filter_t_; + int filter_r_; + int filter_s_; + int offset_k_[ThreadMap::Iterations::kStrided]; + int offset_c_[ThreadMap::Iterations::kContiguous]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = + threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the filter tensor w that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int c = offset_c_[iteration_contiguous_]; + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_t_, filter_r_, filter_s_, c); + } + + /// Returns true if the current coordinate is within the filter tensor w + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
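+  ///
+  /// Intended host-side usage is a sketch along these lines (Iterator denotes a fully
+  /// specialized instance of this template):
+  ///
+  ///   if (Iterator::can_implement(problem_size) != Status::kSuccess) {
+  ///     // dispatch a different kernel or report the unsupported problem size
+  ///   }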
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..47e7de46a0 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,331 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided +> +class Conv3dDgradOutputGradientTileAccessIteratorAnalytic; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv3dDgradOutputGradientTileAccessIteratorAnalytic strided dgrad needs special handling using +// unscaled coordinations +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + ConvProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_; + int filter_t_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_d_[ThreadMap::Iterations::kStrided]; + int offset_w_[ThreadMap::Iterations::kStrided]; + int offset_h_[ThreadMap::Iterations::kStrided]; + +private: + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator but DOES NOT scale by the convolution stride. This is needed + /// to compute predicates in the valid() method. The return value of the public at() + /// method is correctly scaled. 
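+  ///
+  /// Example with hypothetical parameters pad_h = 1, dilation_h = 1, stride_h = 2:
+  /// input row h = 3 and filter row r = 0 give an unscaled p of h + pad_h - r * dilation_h = 4;
+  /// valid() requires 4 % stride_h == 0 and at() then returns p = 4 / stride_h = 2.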
+ CUTLASS_HOST_DEVICE + TensorCoord unscaled_at_() const { + int n = offset_n_[iteration_strided_]; + int d = offset_d_[iteration_strided_]; + int h = offset_h_[iteration_strided_]; + int w = offset_w_[iteration_strided_]; + + int t = filter_t_; + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int z = (d + problem_size_.pad_d - t * problem_size_.dilation_d); + int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h); + int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w); + + return TensorCoord(n, z, p, q, filter_k_); + } + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W); + int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W); + + offset_d_[s] = residual / (problem_size_.H * problem_size_.W); + residual = residual % (problem_size_.H * problem_size_.W); + + offset_h_[s] = residual / problem_size_.W; + offset_w_[s] = residual % problem_size_.W; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // move to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. 
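+  ///
+  /// Note that advance() above steps the filter position with s fastest, then r, then t;
+  /// only after a full T * R * S sweep does filter_k_ move by Shape::kColumn * split_k_slices,
+  /// so a hypothetical 3x3x3 filter takes 27 advance() calls per GEMM-K block.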
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + TensorCoord coord = unscaled_at_(); + + return TensorCoord( + coord.n(), + coord.d() / problem_size_.stride_d, + coord.h() / problem_size_.stride_h, + coord.w() / problem_size_.stride_w, + coord.c()); + } + + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord unscaled_coord = unscaled_at_(); + TensorCoord coord = at(); + + return + !(unscaled_coord.d() % problem_size_.stride_d) && + !(unscaled_coord.h() % problem_size_.stride_h) && + !(unscaled_coord.w() % problem_size_.stride_w) && + coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.Z && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..f5d14b5b10 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h @@ -0,0 +1,296 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dFpropActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + ConvProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_t_; + int filter_r_; + int filter_s_; + int filter_c_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_z_[ThreadMap::Iterations::kStrided]; + int offset_p_[ThreadMap::Iterations::kStrided]; + int 
offset_q_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + offset_z_[s] = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + offset_p_[s] = residual / problem_size_.Q; + offset_q_[s] = residual % problem_size_.Q; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_c_ += Shape::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. 
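+  ///
+  /// Illustrative mapping with hypothetical parameters stride_d = 1, pad_d = 1,
+  /// dilation_d = 1: output index z = 0 with filter index t = 0 gives
+  /// d = z * stride_d - pad_d + t * dilation_d = -1, which valid() rejects because the
+  /// access falls in the zero-padding region (d < 0).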
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + int n = offset_n_[iteration_strided_]; + int z = offset_z_[iteration_strided_]; + int p = offset_p_[iteration_strided_]; + int q = offset_q_[iteration_strided_]; + + int t = filter_t_; + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - filter_t_); + r = (problem_size_.R - 1 - filter_r_); + s = (problem_size_.S - 1 - filter_s_); + } + + int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..bad6598baf --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h @@ -0,0 +1,262 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dFpropFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + ConvProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params 
const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_t_; + int filter_r_; + int filter_s_; + int filter_c_; + + int offset_k_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the filter tensor W that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_t_, filter_r_, filter_s_, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..0ad49abd31 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h @@ -0,0 +1,281 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
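+
+    For Wgrad, rows of the GEMM B tile index the flattened (N, Z, P, Q) output positions and
+    columns index the flattened (T, R, S, C) filter positions; this iterator translates each
+    (row, column) pair back into an activation coordinate (n, d, h, w, c).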
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Filter postion (t,r,s,c) in contiguous dimension stays constant for each gemm_iteration_k + int filter_t_[ThreadMap::Iterations::kContiguous]; + int filter_r_[ThreadMap::Iterations::kContiguous]; + int filter_s_[ThreadMap::Iterations::kContiguous]; + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize t,r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int trsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C); + int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C); + + filter_r_[c] = residual / (problem_size_.S * problem_size_.C); + residual = residual % (problem_size_.S * problem_size_.C); + + filter_s_[c] = residual / problem_size_.C; + filter_c_[c] = residual % problem_size_.C; + + } + + // initialize n, z, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < 
ThreadMap::Iterations::kStrided; ++s) { + + offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int t = filter_t_[iteration_contiguous_]; + int r = filter_r_[iteration_contiguous_]; + int s = filter_s_[iteration_contiguous_]; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + int z = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..35c4643052 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h @@ -0,0 +1,346 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
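+
+    Compared with the Analytic variant, this Optimized iterator precomputes fast integer
+    division/modulo parameters and the effective (padded, dilated) filter offsets once in
+    Params and the constructor, so per-access coordinate computation in device code avoids
+    integer division.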
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + int RSC; // product of R*S*C + unsigned rsc_mul; // precomputed quantities for fast computation of div/% by RSC + unsigned rsc_shr; // in device code. + + int SC; // product of S*C + unsigned sc_mul; // precomputed quantities for fast computation of div/% by SC + unsigned sc_shr; // in device code. + + unsigned c_mul; // precomputed quantities for fast computation of div/% by C + unsigned c_shr; // in device code. + + int ZPQ; // product of Z*P*Q + unsigned zpq_mul; // precomputed quantities for fast computation of div/% by ZPQ + unsigned zpq_shr; // in device code. + + int PQ; // product of P*Q + unsigned pq_mul; // precomputed quantities for fast computation of div/% by PQ + unsigned pq_shr; // in device code. + + unsigned q_mul; // precomputed quantities for fast computation of div/% by Q + unsigned q_shr; // in device code. + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + // Precompute several quantities for fast modulo arithmetic. 
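+      // find_divisor() computes a (multiplier, shift) pair for a runtime-constant divisor so
+      // that fast_divmod() can later replace division and modulo with a multiply-and-shift
+      // sequence in device code. Illustrative usage only (not part of this constructor):
+      //
+      //   unsigned mul, shr;
+      //   find_divisor(mul, shr, divisor);                                 // precompute once
+      //   fast_divmod(quotient, remainder, dividend, divisor, mul, shr);   // per access
+      //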
+ RSC = problem_size.R * problem_size.S * problem_size.C; + find_divisor(rsc_mul, rsc_shr, RSC); + + SC = problem_size.S * problem_size.C; + find_divisor(sc_mul, sc_shr, SC); + + find_divisor(c_mul, c_shr, problem_size.C); + + ZPQ = problem_size.Z * problem_size.P * problem_size.Q; + find_divisor(zpq_mul, zpq_shr, ZPQ); + + PQ = problem_size.P * problem_size.Q; + find_divisor(pq_mul, pq_shr, PQ); + + find_divisor(q_mul, q_shr, problem_size.Q); + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Precomputed effective filter postion (t,r,s) in contiguous dimension stays constant for each gemm_iteration_k + // required for nzpq -> ndhw translation + int precomputed_filter_t_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_r_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_s_[ThreadMap::Iterations::kContiguous]; + + // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorOptimized( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize t,r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int trsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C); + // int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C); + // + // filter_r_[c] = residual / (problem_size_.S * problem_size_.C); + // residual = residual % (problem_size_.S * problem_size_.C); + // + // filter_s_[c] = residual / problem_size_.C; + // filter_c_[c] = residual % problem_size_.C; + + int residual; + fast_divmod(precomputed_filter_t_[c], residual, trsc_offset, params_.RSC, params_.rsc_mul, params_.rsc_shr); + fast_divmod(precomputed_filter_r_[c], residual, residual, params_.SC, params_.sc_mul, params_.sc_shr); + fast_divmod(precomputed_filter_s_[c], filter_c_[c], residual, problem_size_.C, params_.c_mul, params_.c_shr); + + int t = precomputed_filter_t_[c]; + int r = precomputed_filter_r_[c]; + int s = precomputed_filter_s_[c]; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + // efective t,r,s for every contiguous dimension + precomputed_filter_t_[c] = - problem_size_.pad_d + t * problem_size_.dilation_d; + precomputed_filter_r_[c] = - problem_size_.pad_h + r * problem_size_.dilation_h; + precomputed_filter_s_[c] = - problem_size_.pad_w + s * problem_size_.dilation_w; + + + } + + // initialize n, z, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * 
ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // int z = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, z, p, q; + fast_divmod(n, residual, offset_nzpq_[iteration_strided_], params_.ZPQ, params_.zpq_mul, params_.zpq_shr); + fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr); + fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr); + + int d = z * problem_size_.stride_d + precomputed_filter_t_[iteration_contiguous_]; + int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_];; + int w = q * problem_size_.stride_w + precomputed_filter_s_[iteration_contiguous_]; + + return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..74017c09f6 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,256 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
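+
+    For Wgrad, the GEMM A tile is drawn from the output gradient tensor Dy: rows index the K
+    output channels and columns index the flattened (N, Z, P, Q) output positions.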
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradOutputGradientTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize filter_k for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] = threadblock_offset.column() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next 
GEMM-K offset (offset_nzpq_) in GEMM-A by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kColumn * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int nzpq = offset_nzpq_[iteration_strided_]; + + int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + int z = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + return TensorCoord(n, z, p, q, filter_k_[iteration_contiguous_]); + } + + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() < problem_size_.Z && + coord.h() < problem_size_.P && + coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..2cab09d1f3 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,330 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradOutputGradientTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + int NZPQ; // precomputd product of N*Z*P*Q for clearing predicates + int ZPQ; // product of Z*P*Q + unsigned zpq_mul; // precomputed quantities for fast computation of div/% by ZPQ + unsigned zpq_shr; // in device code. 
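+
+    // The (mul, shr) pairs here and below are produced by find_divisor() in the constructor
+    // and consumed by fast_divmod() in at_() so that device code avoids integer division.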
+ + int PQ; // product of P*Q + unsigned pq_mul; // precomputed quantities for fast computation of div/% by PQ + unsigned pq_shr; // in device code. + + unsigned q_mul; // precomputed quantities for fast computation of div/% by Q + unsigned q_shr; // in device code. + + LongIndex offset_next_strided; // offset in units of bytes to next nzpq coordinate within tile + LongIndex offset_next_contiguous; // offset in units of bytes to next k coordinate within tile + LongIndex inc_next_nzpq; // offset in units of bytes to next nzpq position in subsequent tile + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + // Incremental offsets in unites of bytes (number of elements) * sizeof_bits::value / 8 + offset_next_strided = (ThreadMap::Delta::kStrided * layout.stride()[0]) + * sizeof_bits::value / 8; + + offset_next_contiguous = (ThreadMap::Delta::kContiguous) + * sizeof_bits::value / 8; + + inc_next_nzpq = (Shape::kColumn * problem_size.split_k_slices * layout.stride()[0]) + * sizeof_bits::value / 8; + + // Precompute several quantities for fast modulo arithmetic. + NZPQ = problem_size.N * problem_size.Z * problem_size.P * problem_size.Q; + ZPQ = problem_size.Z * problem_size.P * problem_size.Q; + find_divisor(zpq_mul, zpq_shr, ZPQ); + + PQ = problem_size.P * problem_size.Q; + find_divisor(pq_mul, pq_shr, PQ); + + find_divisor(q_mul, q_shr, problem_size.Q); + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_k_; + int offset_nzpq_; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorOptimized( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_k_(0), + offset_nzpq_(0) { + + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.contiguous(); + offset_nzpq_ = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous; + int offset_nzpq = offset_nzpq_ + s * ThreadMap::Delta::kStrided; + + bool predicate = valid_(at_(offset_nzpq, filter_k)); + + uint32_t pred = (predicate ? 
1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) + pointer_ += ( + offset_nzpq_ * params.layout.stride()[0] + filter_k_ + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + offset_nzpq_ += Shape::kColumn * problem_size_.split_k_slices; + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (offset_nzpq_ + s * ThreadMap::Delta::kStrided >= params_.NZPQ) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + pointer_ += params_.inc_next_nzpq; + } + +private: + /// Returns the coordinate in the output gradient tensor Dy that is (offset_nzpq, k) pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at_(int offset_nzpq, int k) const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int nzpq = offset_nzpq_; + // int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // int z = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, z, p, q; + fast_divmod(n, residual, offset_nzpq, params_.ZPQ, params_.zpq_mul, params_.zpq_shr); + fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr); + fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr); + + return TensorCoord(n, z, p, q, k); + } + + /// Returns true if the coord is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid_(TensorCoord coord) const { + + return coord.n() < problem_size_.N && + coord.c() < problem_size_.K; + } + +public: + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast( + pointer_ + + iteration_strided_ * params_.offset_next_strided + + iteration_contiguous_ * params_.offset_next_contiguous + ); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < 
ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h new file mode 100644 index 0000000000..1702847c10 --- /dev/null +++ b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h @@ -0,0 +1,480 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel. 
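+
+    The mainloop keeps Stages threadblock tiles of each operand resident in shared memory and
+    uses cp.async (NVIDIA Ampere architecture) to overlap global->shared copies for future
+    stages with warp-level MMA on the current stage.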
+*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class ImplicitGemmMultistage : + public gemm::threadblock::MmaBase { +public: + ///< Base class + using Base = gemm::threadblock::MmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + + using ElementC = typename Policy::Operator::ElementC; + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Internal structure exposed for introspection. 
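+  /// Detail derives, from the operand thread maps, the number of cp.async instructions needed
+  /// to load one stage of each operand and how those instructions are divided into groups
+  /// issued per warp-level GEMM iteration.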
+ struct Detail { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + ImplicitGemmMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA &iterator_A, IteratorB &iterator_B, + int group_start_A = 0, int group_start_B = 0) { + + iterator_A.set_iteration_index(group_start_A); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / 8; + + 
cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B); + + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const &src_accum, + ///< Imaginary strides used for planar-complex only - ignored here + int64_t imag_stride_A = 0, + int64_t imag_stride_B = 0) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Waits until kStages-2 stages have committed. 
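+    // cp_async_wait<Base::kStages - 2>() allows at most kStages - 2 committed cp.async groups
+    // to remain in flight, guaranteeing that the data for the first stage consumed below has
+    // arrived in shared memory.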
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance(iterator_A, iterator_B); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + } else { + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + } + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + // Inserts a fence to group cp.async instructions into stages. 
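+          // The stage transition (fence, wait, iterator advance, shared-memory pointer wrap)
+          // is issued on the second-to-last warp-level k-group so that its latency overlaps
+          // with the remaining math of the current stage.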
+ cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + } + } + + } + + // Insert fence and wait for all outstanding cp.async operations to commit. + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h new file mode 100644 index 0000000000..0d56ab6b3f --- /dev/null +++ b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h @@ -0,0 +1,313 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter< + typename SmemIteratorA_::Element, + typename IteratorA_::Element, + IteratorA_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool +> +class ImplicitGemmPipelined : public gemm::threadblock::MmaBase { +public: + + ///< Base class + using Base = gemm::threadblock::MmaBase; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static 
ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + ImplicitGemmPipelined( + typename Base::SharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) { + + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC &accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const &src_accum, ///< source accumulator tile + TransformA transform_A = TransformA(), ///< transformation applied to A fragment + TransformB transform_B = TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + 
this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + } + + warp_mma(accum, warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], accum); + } + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index 1f624f1fa8..bd69a707d3 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -38,6 +38,9 @@ #include "cutlass/layout/pitch_linear.h" #include "cutlass/tensor_view.h" #include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -156,13 +159,23 @@ namespace gemm { template inline std::ostream & operator<<(std::ostream &out, GemmShape const &gemm_shape) { - out << "cutlass::GemmShape::(kM, kN, kK) {" + out << "cutlass::gemm::GemmShape::(kM, kN, kK) {" << cutlass::gemm::GemmShape::kM <<"," << cutlass::gemm::GemmShape::kN <<"," << cutlass::gemm::GemmShape::kK << "}"; return out; } +/// Default printing to ostream for GemmCoord +inline +std::ostream & operator<<(std::ostream &out, GemmCoord const &gemm_coord) { 
+ out << "cutlass::gemm::GemmCoord:: {" + << gemm_coord.m() <<"," + << gemm_coord.n() <<"," + << gemm_coord.k() << "}"; + return out; +} + } //namespace gemm /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -185,5 +198,44 @@ std::ostream & operator<<(std::ostream &out, PitchLinearShape { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } @@ -354,7 +354,7 @@ class LinearCombinationClamp { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kCount; ++i) { - scaled_accumulator[i] = static_cast(intermediate[i]); + scaled_accumulator[i] = __float2int_rn(intermediate[i]); } // Convert to destination numeric type @@ -385,7 +385,7 @@ class LinearCombinationClamp { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kCount; ++i) { - scaled_accumulator[i] = static_cast(intermediate[i]); + scaled_accumulator[i] = __float2int_rn(intermediate[i]); } // Convert to destination numeric type @@ -495,7 +495,7 @@ class FastLinearCombinationClamp { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } diff --git a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h index 3934af1041..68f334bdb8 100644 --- a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h +++ b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h @@ -134,7 +134,7 @@ class LinearCombinationPlanarComplex { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } diff --git a/include/cutlass/epilogue/thread/linear_combination_relu.h b/include/cutlass/epilogue/thread/linear_combination_relu.h index 7a2fa9e8af..7a41404791 100644 --- a/include/cutlass/epilogue/thread/linear_combination_relu.h +++ b/include/cutlass/epilogue/thread/linear_combination_relu.h @@ -28,6 +28,7 @@ #pragma once +#include #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" #include "cutlass/array.h" @@ -77,7 +78,6 @@ class LinearCombinationRelu { ElementCompute threshold; ///< minimum value that is output ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - ElementCompute const *threshold_ptr; ///< pointer to threshold scalar - if not null, loads from memory // // Methods // @@ -88,15 +88,14 @@ class LinearCombinationRelu { beta(ElementCompute(0)), threshold(ElementCompute(0)), alpha_ptr(nullptr), - beta_ptr(nullptr), - threshold_ptr(nullptr) { } + beta_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta, ElementCompute threshold = ElementCompute(0) - ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr), threshold_ptr(nullptr) { + ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { } @@ -104,8 +103,8 @@ class LinearCombinationRelu { Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, - 
ElementCompute const *threshold_ptr = nullptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold_ptr(threshold_ptr) { + ElementCompute threshold = ElementCompute(0) + ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } }; @@ -128,7 +127,7 @@ class LinearCombinationRelu { alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); - threshold_ = (params.threshold_ptr ? *params.threshold_ptr : params.threshold); + threshold_ = params.threshold; } /// Returns true if source is needed @@ -139,10 +138,16 @@ class LinearCombinationRelu { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } + + if (k_partition != k_partition_count - 1) { + // set to NaN to make ReLU no-op for all except last k partitions + int64_t allones = -1; + threshold_ = reinterpret_cast(allones); + } } /// Computes linear scaling: D = alpha * accumulator + beta * source @@ -205,7 +210,6 @@ class LinearCombinationRelu { } }; - ///////////////////////////////////////////////////////////////////////////////////////////////// // Conditional guards to enable partial specialization for packed integers @@ -245,7 +249,6 @@ class LinearCombinationRelu { ElementCompute threshold; ///< minimum value that is output ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - ElementCompute const *threshold_ptr; ///< pointer to threshold scalar - if not null, loads from memory // // Methods // @@ -256,15 +259,14 @@ class LinearCombinationRelu { beta(ElementCompute(0)), threshold(ElementCompute(0)), alpha_ptr(nullptr), - beta_ptr(nullptr), - threshold_ptr(nullptr) { } + beta_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta, ElementCompute threshold = ElementCompute(0) - ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr), threshold_ptr(nullptr) { + ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { } @@ -272,8 +274,8 @@ class LinearCombinationRelu { Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, - ElementCompute const *threshold_ptr = nullptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold_ptr(threshold_ptr) { + ElementCompute threshold = ElementCompute(0) + ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } }; @@ -296,7 +298,7 @@ class LinearCombinationRelu { alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); - threshold_ = (params.threshold_ptr ? 
*params.threshold_ptr : params.threshold); + threshold_ = params.threshold; } /// Returns true if source is needed @@ -307,10 +309,16 @@ class LinearCombinationRelu { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } + + if (k_partition != k_partition_count - 1) { + // set to NaN to make ReLU no-op for all except last k partitions + int64_t allones = -1; + threshold_ = reinterpret_cast(allones); + } } /// Computes linear scaling: D = alpha * accumulator + beta * source @@ -331,26 +339,41 @@ class LinearCombinationRelu { multiplies mul_add_source; multiply_add mul_add_accumulator; - ReLu relu; + ReLu relu; intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - // Convert floats back to INT - FragmentAccumulator scaled_accumulator; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kCount; ++i) { - scaled_accumulator[i] = static_cast(intermediate[i]); - } - // Compute threshold optionally - scaled_accumulator = relu(threshold_, scaled_accumulator); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; + intermediate = relu(threshold_, intermediate); - return destination_converter(scaled_accumulator); + if (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value) { + // Convert floats back to INT + FragmentAccumulator scaled_accumulator; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + scaled_accumulator[i] = __float2int_rn(intermediate[i]); + } + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(scaled_accumulator); + } else { + NumericArrayConverter + destination_converter; + return destination_converter(intermediate); + } } /// Computes linear scaling: D = alpha * accumulator @@ -367,25 +390,48 @@ class LinearCombinationRelu { ComputeFragment intermediate; multiplies mul_accumulator; - ReLu relu; + ReLu relu; intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + // Compute threshold optionally + intermediate = relu(threshold_, intermediate); + // Convert floats back to INT FragmentAccumulator scaled_accumulator; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kCount; ++i) { - scaled_accumulator[i] = static_cast(intermediate[i]); + scaled_accumulator[i] = __float2int_rn(intermediate[i]); } - // Compute threshold optionally - scaled_accumulator = relu(threshold_, scaled_accumulator); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; - - return destination_converter(scaled_accumulator); + if (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value) { + // Convert floats back to INT + FragmentAccumulator scaled_accumulator; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + scaled_accumulator[i] = __float2int_rn(intermediate[i]); + } + + // Convert to destination 
numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(scaled_accumulator); + } else { + NumericArrayConverter + destination_converter; + return destination_converter(intermediate); + } } }; @@ -398,4 +444,3 @@ class LinearCombinationRelu { } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 3a65c49acf..dbefd2258c 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -133,7 +133,7 @@ class LinearCombinationSigmoid { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h index 8390ee0b47..08b829be1d 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h @@ -367,6 +367,52 @@ struct DefaultInterleavedEpilogueTensorOp { //////////////////////////////////////////////////////////////////////////////// +/// Defines sensible defaults for epilogues for TensorOps which uses +/// intereleaved output layout. For this case, shared memory is not needed. +template +struct DefaultInterleavedConvEpilogue { + using Shape = Shape_; + using WarpMmaTensorOp = WarpMmaTensorOp_; + static int const kPartitionsK = PartitionsK; + using OutputOp = OutputOp_; + static int const kElementsPerAccess = ElementsPerAccess; + + using ElementOutput = typename OutputOp::ElementOutput; + using ElementAccumulator = typename WarpMmaTensorOp::ElementC; + + // + // Thread map + // + using OutputTileThreadMap = typename cutlass::epilogue::threadblock:: + DefaultInterleavedConvThreadMapTensorOp< + Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput, + kElementsPerAccess, InterleavedK>::Type; + + using OutputTileIterator = + cutlass::epilogue::threadblock::InterleavedConvPredicatedTileIterator< + OutputTileThreadMap, ElementOutput, InterleavedK>; + + using AccumulatorFragmentIterator = + cutlass::epilogue::warp::FragmentIteratorTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + // can reuse the gemm version here to do element selection + layout::ColumnMajorInterleaved>; + + // + // Define the epilogue + // + using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue< + Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator, + AccumulatorFragmentIterator, OutputOp, InterleavedK, IsBetaZero>; +}; + +//////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h index 96e4335cab..752b1ee9b4 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h @@ -144,6 
+144,55 @@ struct DefaultInterleavedThreadMapTensorOp { Detail::kThreads, kElementsPerAccess, sizeof_bits::value>; }; + +//////////////////////////////////////////////////////////////////////////////// + +/// Defines the optimal thread map for TensorOp accumulator layouts +template +struct DefaultInterleavedConvThreadMapTensorOp { + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + static int const kPartitionsK = PartitionsK; + using Element = Element_; + static int const kElementsPerAccess = ElementsPerAccess; + static int const kInterleavedK = InterleavedK; + + // + // Definitions + // + + struct Detail { + /// Tensor Operations fundamentally perform operations on 8 rows + static int const kTensorOpRows = 8; + static int const kWarpSize = 32; + + static_assert(!(ThreadblockShape::kM % WarpShape::kM) && + !(ThreadblockShape::kN % WarpShape::kN), + "Divisibility"); + + /// Number of warps + using WarpCount = + gemm::GemmShape; + + /// Number of participating threads + static int const kThreads = WarpCount::kCount * kWarpSize; + }; + + // + // ThreadMap + // + + /// ThreadMap to be used by epilogue::MaskedTileIterator satisfying concept + /// InterleavedOutputTileThreadMap + using Type = InterleavedConvOutputTileThreadMap< + MatrixShape, + MatrixShape, + Detail::kThreads, kElementsPerAccess, sizeof_bits::value>; +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/output_iterator_parameter.h b/include/cutlass/epilogue/threadblock/output_iterator_parameter.h new file mode 100644 index 0000000000..8cfba768c1 --- /dev/null +++ b/include/cutlass/epilogue/threadblock/output_iterator_parameter.h @@ -0,0 +1,92 @@ +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/tensor_ref.h" + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +template< + typename TensorLayout_, ///! The original output tensor layout + typename OutputIteratorLayout_, ///! Layout used by epilogue output iterator + typename TensorRef_, ///! Input tensor to epilogue output iterator + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ ///! Convolutional operator on 2D or 3D problem +> +struct ConvOutputIteratorParameter { + + using TensorLayout = TensorLayout_; + using OutputIteratorLayout = OutputIteratorLayout_; + using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord; + using TensorRef = TensorRef_; + static conv::Operator const kConvolutionalOperator = ConvOperator; + using ConvProblemSize = ConvProblemSize_; + + /// Wgrad stride idx for implicit gemm algorithm + // Conv2d row-major matrix (KxRSC) + // Conv3d row-major matrix (KxTRSC) + static int const kWgradStrideIdx = + platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? 
kWgradStrideIdx : 0); + + + CUTLASS_HOST_DEVICE + static OutputIteratorLayout layout(const TensorRef & ref) { + return ref.stride(kTensorStrideIdx); + } + + CUTLASS_HOST_DEVICE + static OutputTensorCoord extent(ConvProblemSize problem_size) { + return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(); + } + +}; + + + +template < + int InterleavedK, + typename TensorRef_, + conv::Operator ConvOperator, + typename ConvProblemSize_ +> +struct ConvOutputIteratorParameter< + layout::TensorNCxHWx, + layout::TensorNCxHWx, + TensorRef_, + ConvOperator, + ConvProblemSize_> +{ + + using TensorLayout = typename layout::TensorNCxHWx; + using OutputIteratorLayout = typename layout::TensorNCxHWx; + using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord; + using TensorRef = TensorRef_; + static conv::Operator const kConvolutionalOperator = ConvOperator; + using ConvProblemSize = ConvProblemSize_; + + CUTLASS_HOST_DEVICE + static OutputIteratorLayout layout(const TensorRef & ref) { + return ref.stride(); + } + + CUTLASS_HOST_DEVICE + static OutputTensorCoord extent(ConvProblemSize problem_size) { + return problem_size.output_extent(); + } + +}; + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h index 4eb5e3784b..cfe13cc167 100644 --- a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h +++ b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h @@ -488,6 +488,68 @@ struct InterleavedOutputTileThreadMap { } }; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template metaprogram for partitioning a 4D interleaved layout across warps +/// to achieve several performance objectives: +/// +/// - coalesced memory accesses in units of 64 Byte lines +/// - minimal address arithmetic +/// - minimal predicate calculations +/// +template +struct InterleavedConvOutputTileThreadMap { + using WarpCount = WarpCount_; + + static int const kWarpSize = 32; + static int const kThreads = Threads; + static int const kWarpCount = kThreads / kWarpSize; + + static int const kElementsPerAccess = ElementsPerAccess; + static int const kElementSize = ElementSize; + + // + // Metaprogram computation + // + + struct Detail {}; + + // + // Output + // + + using Iterations = Iterations_; + + using Delta = MatrixShape; + + /// Initial offset function + CUTLASS_HOST_DEVICE + static MatrixCoord initial_offset(int thread_idx) { + int warp_idx = thread_idx / kWarpSize; + int lane_idx = thread_idx % kWarpSize; + + // Compute warp location + MatrixCoord warp_footprint{ + Delta::kRow * Iterations::kRow, + Delta::kColumn * Iterations::kColumn, + }; + + MatrixCoord warp_offset{warp_idx % WarpCount::kRow, + warp_idx / WarpCount::kRow}; + + // Compute per-lane offset + MatrixCoord thread_offset_in_warp{lane_idx / 4, + (lane_idx % 4) * kElementsPerAccess}; + + MatrixCoord thread_offset_in_threadblock_tile = + warp_footprint * warp_offset + thread_offset_in_warp; + + return thread_offset_in_threadblock_tile; + } +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h index 05af759a5e..1be50cbd90 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h +++ 
b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h @@ -43,6 +43,7 @@ #include "cutlass/epilogue/threadblock/output_tile_thread_map.h" #include "cutlass/arch/arch.h" #include "cutlass/arch/memory.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h" //////////////////////////////////////////////////////////////////////////////// @@ -102,68 +103,20 @@ class PredicatedTileIterator { // Parameters struct // - struct Params { - - // - // Data members - // - - LongIndex stride; ///< stride in bytes between rows - - LongIndex increment_row; ///< increment quantity (in bytes) to advance when moving between rows - LongIndex increment_group; ///< increment quantity (in bytes) to advance when moving to the next group - LongIndex increment_cluster; ///< increment quantity (in bytes) to advance when moving to the next cluster - - LongIndex advance_row; ///< amount to add to move to the next 'row' position - LongIndex advance_group; ///< amount to add to move to the next 'group' position - LongIndex advance_cluster; ///< amount to add to move to the next 'cluster' position - LongIndex advance_tile; ///< amount to add to move to the next 'tile' - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Status initialize(Index stride_) { - - stride = LongIndex(stride_); - - increment_row = stride * ThreadMap::Delta::kRow; - - increment_group = stride * ThreadMap::Delta::kGroup - - stride * ThreadMap::Delta::kRow * (ThreadMap::Iterations::kRow - 1); - - increment_cluster = stride * ThreadMap::Delta::kCluster - - stride * ThreadMap::Delta::kGroup * (ThreadMap::Iterations::kGroup - 1) - - stride * ThreadMap::Delta::kRow * (ThreadMap::Iterations::kRow - 1); - - advance_row = stride * ThreadMap::Shape::kRow; - - advance_group = stride * (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; - - advance_cluster = - stride * - ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;; - - advance_tile = - stride * - ThreadMap::Shape::kGroup * - ThreadMap::Shape::kRow * - ThreadMap::Shape::kCluster * - ThreadMap::Shape::kTile; - - return Status::kSuccess; - } + /// Uses a non-template class + struct Params : PredicatedTileIteratorParams { CUTLASS_HOST_DEVICE - Params() { - initialize(0); - } + Params() { } CUTLASS_HOST_DEVICE - Params(Layout const &layout) { - - initialize(layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess); + Params(Layout const &layout): + PredicatedTileIteratorParams( + layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, + make_OutputTileThreadMapDesc() + ) + { + } }; @@ -207,7 +160,7 @@ class PredicatedTileIterator { // /// Parameters structure containing reference and precomputed state. - Params params_; + PredicatedTileIteratorParams params_; /// Byte-level pointer uint8_t *byte_pointer_; @@ -239,12 +192,13 @@ class PredicatedTileIterator { /// Constructor CUTLASS_DEVICE PredicatedTileIterator( - Params const & params, + PredicatedTileIteratorParams const & params, Element *pointer, TensorCoord extent, int thread_idx, TensorCoord threadblock_offset = TensorCoord() - ): params_(params) + ): + params_(params) { TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; @@ -745,6 +699,309 @@ class InterleavedPredicatedTileIterator { }; /////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load output tile from shared memory in epilogue. 
+/// +/// Satisfies: ReadableTileIterator | InterleavedMaskedTileIterator | ForwardTileIterator +/// +template < + typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap) + typename Element_, ///< Element data type + int InterleavedN ///< Number of Interleaved N +> +class InterleavedConvPredicatedTileIterator { +public: + using ThreadMap = ThreadMap_; + + using Element = Element_; + + using Layout = layout::TensorNCxHWx; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = Tensor4DCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kThreads = ThreadMap::kThreads; + static int const kIterations = ThreadMap::Iterations::kCount; + + /// Fragment object + using Fragment = Array; + + /// Memory access size + using AccessType = AlignedArray; + + // + // Parameters struct + // + + struct Params { + + // + // Data members + // + + LongIndex stride_col; ///< stride in bytes between columns + LongIndex stride_row; ///< stride in bytes between rows + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Status initialize(typename Layout::Stride stride_) { + stride_col = stride_[1]; + stride_row = stride_[2]; + + return Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Params() { + initialize(cutlass::make_Coord(0, 0, 0)); + } + + CUTLASS_HOST_DEVICE + Params(Layout const &layout) { + + initialize(layout.stride()); + } + }; + + /// Mask object + struct Mask { + static int const kCount = + (ThreadMap::Iterations::kRow < 8) ? 8 : ThreadMap::Iterations::kRow; + + /// Predicate state + bool predicates[kCount]; + + // + // Mask + // + CUTLASS_HOST_DEVICE + Mask() { + enable(); + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_HOST_DEVICE void clear() { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = false; + } + } + + ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask + CUTLASS_DEVICE void enable() { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = true; + } + } + }; + +private: + + // + // Data members + // + + /// Parameters structure containing reference and precomputed state. 
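// ---------------------------------------------------------------------------
// [Editorial note] Address pattern used by this iterator (see the constructor
// and load() below): the output column k is split into an interleaved channel
// block (k / InterleavedN) and an offset within that block (k % InterleavedN),
// while the output row decomposes into the batch index n and the spatial index
// pq via fast_divmod. Standalone sketch of the resulting element offset; for a
// packed NC/xHWx output tensor, stride_col is on the order of InterleavedN * P * Q
// and stride_row on the order of C * P * Q (both stated here as assumptions for
// illustration only):
CUTLASS_HOST_DEVICE
int64_t ncxhwx_element_offset_sketch(
    int n, int pq, int k,
    int64_t stride_col, int64_t stride_row, int interleaved_n) {

  return (k / interleaved_n) * stride_col + (k % interleaved_n) +
         int64_t(n) * stride_row + int64_t(pq) * interleaved_n;
}
// ---------------------------------------------------------------------------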
+ Params params_; + + /// Byte-level pointer + uint8_t *byte_pointer_; + + /// Array of boolean values to contain steady-state predicates + Mask mask_; + + /// Extent of the matrix tile in columns + Index extent_col_; + + /// Extent of the matrix tile in rows + Index extent_row_; + + /// Extent of the matrix tile in pq + Index extent_pq_; + + /// A thread's starting row position (assuming steady-state predicates have + /// been computed) + Index thread_start_row_; + + /// A thread's starting column position (assuming steady-state predicates have + /// been computed) + Index thread_start_col_; + + /// Internal iteration counter + LongIndex iteration_row_; + LongIndex iteration_col_; + + uint32_t pq_mul_; + + uint32_t pq_shr_; + +private: + + // + // Methods + // + +public: + + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + InterleavedConvPredicatedTileIterator( + Params const & params, + Element *pointer, + TensorCoord extent, + int thread_idx, + MatrixCoord threadblock_offset + ): + params_(params) { + MatrixCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; + + extent_col_ = extent.c(); + extent_pq_ = extent.h() * extent.w(); + extent_row_ = extent.n() * extent_pq_; + + find_divisor(pq_mul_, pq_shr_, extent_pq_); + + thread_start_row_ = thread_offset.row(); + thread_start_col_ = thread_offset.column(); + + // Initialize predicates + CUTLASS_PRAGMA_UNROLL + for (int r = 0; r < ThreadMap::Iterations::kRow; ++r) { + mask_.predicates[r] = + ((thread_offset.row() + ThreadMap::Delta::kRow * r) < extent_row_); + } + + // Initialize pointer + byte_pointer_ = reinterpret_cast(pointer) + + ((thread_start_col_ / InterleavedN) * params_.stride_col + + (thread_start_col_ % InterleavedN)) * + sizeof_bits::value / 8; + + // Initialize internal state counter + iteration_row_ = iteration_col_ = 0; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { + + int col_offset = iteration_col_ * ThreadMap::Delta::kColumn; + bool col_guard = ((thread_start_col_ + col_offset) < extent_col_); + bool guard = col_guard && mask_.predicates[iteration_row_]; + + int n, pq_rem; + + fast_divmod(n, pq_rem, + thread_start_row_ + iteration_row_ * ThreadMap::Delta::kRow, + extent_pq_, pq_mul_, pq_shr_); + + uint8_t *byte_pointer = + byte_pointer_ + (n * params_.stride_row + pq_rem * InterleavedN) * + sizeof_bits::value / 8; + AccessType *frag_ptr = reinterpret_cast(&frag); + AccessType const *memory_pointer = + reinterpret_cast(byte_pointer); + + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + *frag_ptr, + (void *)memory_pointer, + guard); + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store(Fragment const &frag) { + + int col_offset = iteration_col_ * ThreadMap::Delta::kColumn; + bool col_guard = ((thread_start_col_ + col_offset) < extent_col_); + bool guard = col_guard && mask_.predicates[iteration_row_]; + + int n, pq_rem; + + fast_divmod(n, pq_rem, + thread_start_row_ + iteration_row_ * ThreadMap::Delta::kRow, + extent_pq_, pq_mul_, pq_shr_); + + uint8_t *byte_pointer = + byte_pointer_ + (n * params_.stride_row + pq_rem * InterleavedN) * + sizeof_bits::value / 8; + AccessType const *frag_ptr = reinterpret_cast(&frag); + AccessType *memory_pointer = reinterpret_cast(byte_pointer); + + if (guard) { + *memory_pointer = 
*frag_ptr; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int iteration) { + iteration_row_ = iteration % ThreadMap::Iterations::kRow; + iteration_col_ = iteration / ThreadMap::Iterations::kRow; + } + + /// Advances to the next position to load or store + CUTLASS_HOST_DEVICE + InterleavedConvPredicatedTileIterator &operator++() { + + ++iteration_row_; + + if (iteration_row_ == ThreadMap::Iterations::kRow) { + + iteration_row_ = 0; + ++iteration_col_; + byte_pointer_ += params_.stride_col; + + if (iteration_col_ == ThreadMap::Iterations::kColumn) { + iteration_col_ = 0; + } + } + + return *this; + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_DEVICE void clear_mask() { + mask_.clear(); + } + + ///< Efficiently enables all accesses guarded by mask + CUTLASS_DEVICE void enable_mask() { + mask_.enable(); + } + + ///< Sets the mask + CUTLASS_DEVICE void get_mask(Mask &mask) { + return mask_; + } + + ///< Sets the mask + CUTLASS_DEVICE void set_mask(Mask const &mask) { + mask_ = mask; + } +}; + /////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h new file mode 100644 index 0000000000..a08e1e0616 --- /dev/null +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h @@ -0,0 +1,227 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct OutputTileShapeDesc { + + int column; + int row; + int group; + int cluster; + int tile; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + OutputTileShapeDesc(): column(0), row(0), group(0), cluster(0), tile(0) { } + + /// Ctor + CUTLASS_HOST_DEVICE + OutputTileShapeDesc( + int column_, + int row_, + int group_, + int cluster_, + int tile_ + ): + column(column_), + row(row_), + group(group_), + cluster(cluster_), + tile(tile_) { } + + /// Total number of points in the 5D space + CUTLASS_HOST_DEVICE + int count() const { + return column * row * group * cluster * tile; + } +}; + +/// Helper template to construct an OutputTileShapeDesc from a OutputTileShape template. +template +CUTLASS_HOST_DEVICE +OutputTileShapeDesc make_OutputTileShapeDesc() { + return OutputTileShapeDesc( + Shape::kColumn, + Shape::kRow, + Shape::kGroup, + Shape::kCluster, + Shape::kTile + ); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Thread map description +struct OutputTileThreadMapDesc { + + int threads; + int elements_per_access; + OutputTileShapeDesc shape; + OutputTileShapeDesc iterations; + OutputTileShapeDesc delta; + OutputTileShapeDesc count; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + OutputTileThreadMapDesc() { } + + CUTLASS_HOST_DEVICE + OutputTileThreadMapDesc( + int threads_, + int elements_per_access_, + OutputTileShapeDesc shape_, + OutputTileShapeDesc iterations_, + OutputTileShapeDesc delta_, + OutputTileShapeDesc count_ + ): + threads(threads_), + elements_per_access(elements_per_access_), + shape(shape_), + iterations(iterations_), + delta(delta_), + count(count_) { } +}; + +/// Helper template to construct an OutputTileShapeDesc from a OutputTileThreadMap template. 
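// ---------------------------------------------------------------------------
// [Editorial note] Usage sketch for OutputTileShapeDesc above, placed before the
// thread-map helper declared next. The descriptor carries the five compile-time
// extents of an epilogue output tile as runtime values, so iterator Params no
// longer need the thread map as a template parameter. The extents below are
// illustrative only:
CUTLASS_HOST_DEVICE
inline int output_tile_point_count_sketch() {
  OutputTileShapeDesc desc(64, 8, 2, 2, 1);   // column, row, group, cluster, tile
  return desc.count();                        // 64 * 8 * 2 * 2 * 1 = 2048
}
// For a compile-time OutputTileShape<64, 8, 2, 2, 1>, the same descriptor is
// produced by make_OutputTileShapeDesc<Shape>() defined above.
// ---------------------------------------------------------------------------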
+template +CUTLASS_HOST_DEVICE +OutputTileThreadMapDesc make_OutputTileThreadMapDesc() { + return OutputTileThreadMapDesc( + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + make_OutputTileShapeDesc(), + make_OutputTileShapeDesc(), + make_OutputTileShapeDesc(), + make_OutputTileShapeDesc() + ); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Parameters struct +// + +struct PredicatedTileIteratorParams { + + using Index = int32_t; + using LongIndex = int64_t; + + // + // Data members + // + + LongIndex stride; ///< stride in bytes between rows + + LongIndex increment_row; ///< increment quantity (in bytes) to advance when moving between rows + LongIndex increment_group; ///< increment quantity (in bytes) to advance when moving to the next group + LongIndex increment_cluster; ///< increment quantity (in bytes) to advance when moving to the next cluster + + LongIndex advance_row; ///< amount to add to move to the next 'row' position + LongIndex advance_group; ///< amount to add to move to the next 'group' position + LongIndex advance_cluster; ///< amount to add to move to the next 'cluster' position + LongIndex advance_tile; ///< amount to add to move to the next 'tile' + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Status initialize(Index stride_, OutputTileThreadMapDesc thread_map) { + + stride = LongIndex(stride_); + + increment_row = stride * thread_map.delta.row; + + increment_group = stride * thread_map.delta.group + - stride * thread_map.delta.row * (thread_map.iterations.row - 1); + + increment_cluster = stride * thread_map.delta.cluster + - stride * thread_map.delta.group * (thread_map.iterations.group - 1) + - stride * thread_map.delta.row * (thread_map.iterations.row - 1); + + advance_row = stride * thread_map.shape.row; + + advance_group = + stride * + (thread_map.shape.group - 1) * thread_map.shape.row * thread_map.count.row; + + advance_cluster = + stride * + thread_map.count.group * + thread_map.shape.group * + thread_map.count.row * + thread_map.shape.row; + + advance_tile = + stride * + thread_map.shape.group * + thread_map.shape.row * + thread_map.shape.cluster * + thread_map.shape.tile; + + return Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + PredicatedTileIteratorParams() { + initialize(0, OutputTileThreadMapDesc()); + } + + CUTLASS_HOST_DEVICE + PredicatedTileIteratorParams(Index stride, OutputTileThreadMapDesc thread_map) { + + initialize(stride, thread_map); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h index 79106b111e..b2a0612ac5 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h @@ -37,7 +37,7 @@ #pragma once -#if !defined(__clang__) +#if !(defined(__clang__) && defined(__CUDA__)) #include "cutlass/wmma_array.h" #include "cutlass/layout/matrix.h" @@ -152,5 +152,7 @@ class FragmentIteratorWmmaTensorOp> shr : src; #else - quo = int((div != 1) ? int(src * mul) >> shr : src); + quo = int((div != 1) ? int(((int64_t)src * mul) >> 32) >> shr : src); #endif // The remainder. 
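// ---------------------------------------------------------------------------
// [Editorial note] What the hunk above corrects: on the non-CUDA (host) path,
// magic-number division must take the high 32 bits of the 64-bit product (the
// equivalent of __umulhi() on the device) before applying the shift. A
// standalone reference version, assuming (mul, shr) were produced by
// find_divisor():
inline void fast_divmod_reference(int &quo, int &rem, int src, int div,
                                  unsigned int mul, unsigned int shr) {
  quo = (div != 1) ? int((int64_t(src) * mul) >> 32) >> shr : src;
  rem = src - quo * div;
}
// ---------------------------------------------------------------------------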
@@ -215,7 +215,7 @@ void fast_divmod(int& quo, int64_t& rem, int64_t src, int div, unsigned int mul, // Use IMUL.HI if div != 1, else simply copy the source. quo = (div != 1) ? __umulhi(src, mul) >> shr : src; #else - quo = int((div != 1) ? (src * mul) >> shr : src); + quo = int((div != 1) ? ((src * mul) >> 32) >> shr : src); #endif // The remainder. rem = src - (quo * div); diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 90cf394941..d20c45df2e 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -161,6 +161,42 @@ struct negate { } }; +/// Greater equal +template +struct greater_equal { + CUTLASS_HOST_DEVICE + bool operator()(T const &lhs, T const &rhs) const { + return (lhs >= rhs); + } +}; + +/// Greater +template +struct greater { + CUTLASS_HOST_DEVICE + bool operator()(T const &lhs, T const &rhs) const { + return (lhs > rhs); + } +}; + +/// Less equal +template +struct less_equal { + CUTLASS_HOST_DEVICE + bool operator()(T const &lhs, T const &rhs) const { + return (lhs <= rhs); + } +}; + +/// Less +template +struct less { + CUTLASS_HOST_DEVICE + bool operator()(T const &lhs, T const &rhs) const { + return (lhs < rhs); + } +}; + /// Fused multiply-add template struct multiply_add { @@ -189,6 +225,40 @@ struct xor_add { } }; +template +struct conjugate { + CUTLASS_HOST_DEVICE + T operator()(T const &a) const { + return a; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct conjugate> { + CUTLASS_HOST_DEVICE + complex operator()(complex const &a) const { + return conj(a); + } +}; + +template +struct conjugate > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + + conjugate conj_op; + + Array ca; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + ca[i] = conj_op(a[i]); + } + return ca; + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specialization for complex to target four scalar fused multiply-adds. 
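// ---------------------------------------------------------------------------
// [Editorial note] Usage sketch for the functors added in the hunks above:
// greater_equal / greater / less_equal / less mirror their std:: counterparts
// but are CUTLASS_HOST_DEVICE, and conjugate<> is an identity for real types
// with specializations for complex<T> and Array<T, N>. (Assumes cutlass/complex.h
// is available alongside cutlass/functional.h; values are illustrative.)
CUTLASS_HOST_DEVICE
inline bool functional_usage_sketch() {
  less<float> lt;
  greater_equal<int> ge;

  conjugate<complex<float>> conj_op;
  complex<float> z(1.0f, 2.0f);
  complex<float> zc = conj_op(z);        // (1, -2)

  return lt(0.5f, 1.0f) && ge(3, 3) && (zc.imag() == -2.0f);
}
// ---------------------------------------------------------------------------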
@@ -1499,6 +1569,86 @@ struct multiply_add, Array, Array +CUTLASS_HOST_DEVICE +Array operator+(Array const &lhs, Array const &rhs) { + plus> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator-(Array const &lhs, Array const &rhs) { + minus> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator-(Array const &lhs) { + negate> op; + return op(lhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator*(Array const &lhs, Array const &rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator*(T lhs, Array const &rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator*(Array const &lhs, T rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator/(Array const &lhs, Array const &rhs) { + divides> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, Array const &b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(T a, Array const &b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, T b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, Array const &b, T c) { + multiply_add> op; + return op(a, b, c); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/device/gemm_sparse.h b/include/cutlass/gemm/device/gemm_sparse.h index df2a141cd1..bfd5606e1f 100644 --- a/include/cutlass/gemm/device/gemm_sparse.h +++ b/include/cutlass/gemm/device/gemm_sparse.h @@ -429,6 +429,25 @@ class SparseGemm { args.epilogue, static_cast(workspace) }; + + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } return Status::kSuccess; } @@ -461,30 +480,11 @@ class SparseGemm { dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); dim3 block(GemmKernel::kThreadCount, 1, 1); - cudaError_t result; - int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); - if (smem_size >= (48 << 10)) { - result = cudaFuncSetAttribute(Kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - - result = cudaFuncSetAttribute( - Kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 100); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - } cutlass::Kernel<<>>(params_); - result = cudaGetLastError(); + cudaError_t result = cudaGetLastError(); return result == cudaSuccess ? 
Status::kSuccess : Status::kErrorInternal; } diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 12a8a6d7f3..a669483541 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -117,9 +117,16 @@ class GemmUniversalAdapter { using ThreadblockShape = typename GemmKernel::Mma::Shape; using WarpShape = typename GemmKernel::WarpShape; using InstructionShape = typename GemmKernel::InstructionShape; - - using OperatorClass = typename GemmKernel::OperatorClass; - using ArchTag = typename GemmKernel::ArchTag; + + // warp-level, arch-level (instruction), math operator + using WarpMmaOperator = typename GemmKernel::Mma::Policy::Operator; + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + // Operator class and arch tag extract bottom-up + // set it for top-level gemm device-level template + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; // Type, layout, and complex transform deliberately exchanged with B using MapArguments = detail::MapArguments< diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h index fc52a08d0f..9ffc6b041c 100644 --- a/include/cutlass/gemm/device/gemm_universal_base.h +++ b/include/cutlass/gemm/device/gemm_universal_base.h @@ -311,6 +311,27 @@ class GemmUniversalBase { gemm_k_size, static_cast(workspace) ); + + // Specify shared memory capacity for kernel. + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } return Status::kSuccess; } @@ -335,38 +356,31 @@ class GemmUniversalBase { Status run(cudaStream_t stream = nullptr) { CUTLASS_TRACE_HOST("GemmUniversalBase::run()"); + // + // Configure grid and block dimensions + // + ThreadblockSwizzle threadblock_swizzle; dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); dim3 block(GemmKernel::kThreadCount, 1, 1); - cudaError_t result; - int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); - if (smem_size >= (48 << 10)) { - result = cudaFuncSetAttribute(Kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - result = cudaFuncSetAttribute( - Kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 100); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - } + // + // Launch kernel + // CUTLASS_TRACE_HOST(" grid: (" << grid << "), block: (" << block << "), SMEM: " << smem_size << " bytes"); + // Launch cutlass::Kernel<<>>(params_); - result = cudaGetLastError(); + // + // Query for errors + // + cudaError_t result = cudaGetLastError(); if (result != cudaSuccess) { CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result)); diff --git a/include/cutlass/gemm/kernel/default_gemm_complex.h b/include/cutlass/gemm/kernel/default_gemm_complex.h index 15b1430c79..cff06e69de 100644 --- a/include/cutlass/gemm/kernel/default_gemm_complex.h 
+++ b/include/cutlass/gemm/kernel/default_gemm_complex.h @@ -49,6 +49,7 @@ #include "cutlass/gemm/kernel/gemm_pipelined.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" #include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" #include "cutlass/gemm/threadblock/default_mma.h" #include "cutlass/gemm/threadblock/default_multistage_mma_complex.h" @@ -112,6 +113,101 @@ struct DefaultGemmComplex; //////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Multiply-add operator + // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial + > +struct DefaultGemmComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementC, + layout::RowMajor, ElementAccumulator, arch::OpClassSimt, + arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, + EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, + WarpShape, + InstructionShape, + ElementA, LayoutA, + ElementB, LayoutB, + ElementAccumulator, layout::RowMajor, + arch::OpClassSimt, + Stages, + Operator, + false, + cutlass::arch::CacheOperation::Global, + cutlass::arch::CacheOperation::Global, + TransformA, + TransformB + >; + + // Define iterators over tiles from the A operand + using IteratorA = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, + typename MmaCore::IteratorThreadMapA>; + + // Define iterators over tiles from the B operand + using IteratorB = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, + typename MmaCore::IteratorThreadMapB>; + + // Define the threadblock-scoped pipelined matrix multiply + using Mma = cutlass::gemm::threadblock::MmaPipelined< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator, + layout::RowMajor, typename MmaCore::MmaPolicy>; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + typename Mma::Operator, + 
EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + /// Define the kernel-level GEMM operator. + using GemmKernel = kernel::Gemm; +}; + +//////////////////////////////////////////////////////////////////////////////// + /// Partial specialization for Ampere Architecture template < /// Element type for A matrix operand @@ -170,6 +266,70 @@ struct DefaultGemmComplex< using GemmKernel = kernel::Gemm; }; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Multiply-add operator + // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial + > +struct DefaultGemmComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementC, + layout::RowMajor, ElementAccumulator, arch::OpClassSimt, + arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, + EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator, + layout::RowMajor, arch::OpClassSimt, arch::Sm80, ThreadblockShape, + WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + typename Mma::Operator, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + /// Define the kernel-level GEMM operator. + using GemmKernel = kernel::Gemm; +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace kernel diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h index fc2daa9759..ce61137f36 100644 --- a/include/cutlass/gemm/kernel/gemm.h +++ b/include/cutlass/gemm/kernel/gemm.h @@ -138,8 +138,20 @@ struct Gemm { typename Epilogue::OutputTileIterator::TensorRef ref_C, typename Epilogue::OutputTileIterator::TensorRef ref_D) { - static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; - static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentA = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = (platform::is_same>::value) + ? 
32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorB::AccessType::kElements; static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; if (!TensorRef_aligned(ref_A, kAlignmentA)) { @@ -274,7 +286,7 @@ struct Gemm { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op.set_k_partition(threadblock_tile_offset.k()); + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } // Tile iterator loading from source tensor. diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h index aede20dae5..b9626145fe 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h @@ -582,7 +582,7 @@ struct GemmPlanarComplex { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op.set_k_partition(threadblock_tile_offset.k()); + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } } else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) { diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h index 99ece26742..bba6217160 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.h +++ b/include/cutlass/gemm/kernel/gemm_universal.h @@ -302,8 +302,20 @@ struct GemmUniversal { CUTLASS_TRACE_HOST("GemmUniversal::can_implement()"); - static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; - static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentA = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorB::AccessType::kElements; static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) || @@ -468,7 +480,7 @@ struct GemmUniversal { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op.set_k_partition(threadblock_tile_offset.k()); + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } } else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) { diff --git a/include/cutlass/gemm/kernel/sparse_gemm.h b/include/cutlass/gemm/kernel/sparse_gemm.h index 7db469e534..730745fdc8 100644 --- a/include/cutlass/gemm/kernel/sparse_gemm.h +++ b/include/cutlass/gemm/kernel/sparse_gemm.h @@ -319,7 +319,7 @@ struct SparseGemm { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op.set_k_partition(threadblock_tile_offset.k()); + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } // Tile iterator loading from source tensor. 
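Several of the hunks above (gemm_sparse.h and gemm_universal_base.h) move the opt-in for more than 48 KB of dynamic shared memory from run() into initialize(), so the attribute is set once per operator rather than on every launch. The underlying CUDA pattern is roughly the sketch below; configure_large_smem is a placeholder helper for illustration, not CUTLASS API.

#include <cuda_runtime.h>

// Opt a kernel into a large dynamic shared memory allocation and prefer shared memory
// over L1. Returns the first CUDA error encountered, if any.
template <typename KernelFn>
cudaError_t configure_large_smem(KernelFn *kernel, int smem_size) {
  cudaError_t result = cudaSuccess;
  if (smem_size >= (48 << 10)) {
    // Raise the per-kernel limit on dynamic shared memory above the 48 KB default.
    result = cudaFuncSetAttribute(
        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
    if (result == cudaSuccess) {
      // Hint that this kernel prefers shared memory over L1 (100% carveout).
      result = cudaFuncSetAttribute(
          kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100);
    }
  }
  return result;
}

// Typical call site, mirroring initialize() above:
//   configure_large_smem(cutlass::Kernel<GemmKernel>,
//                        int(sizeof(typename GemmKernel::SharedStorage)));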
diff --git a/include/cutlass/gemm/thread/mma_sm60.h b/include/cutlass/gemm/thread/mma_sm60.h index 486497cb79..07e2d55629 100644 --- a/include/cutlass/gemm/thread/mma_sm60.h +++ b/include/cutlass/gemm/thread/mma_sm60.h @@ -93,6 +93,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -179,6 +182,9 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -270,6 +276,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -356,6 +365,8 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; // // Methods // @@ -443,6 +454,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -533,6 +547,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -623,6 +640,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -714,6 +734,9 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -800,6 +823,9 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -879,6 +905,9 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // diff --git a/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/include/cutlass/gemm/threadblock/default_mma_core_simt.h index be50149372..ba3a161650 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_simt.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_simt.h @@ -389,7 +389,7 @@ struct DefaultMmaCore, ElementA_, /// Policy used to define MmaPipelined using MmaPolicy = MmaPolicy< MmaWarpSimt, - MatrixShape, // skew for A matrix to avoid SMEM bank conflicts + MatrixShape, // skew for A matrix to avoid SMEM bank conflicts MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts WarpCount::kK >; diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h index 7f3d534a1f..36c5c54ee9 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h @@ -34,6 +34,7 @@ #include "cutlass/gemm/threadblock/default_mma_core_sm80.h" #include "cutlass/numeric_types.h" #include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h index 
230e8d7681..697d22bf6d 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h @@ -1105,6 +1105,676 @@ struct DefaultMultistageMmaComplexCore< //////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + typename RealA, + typename RealB, + typename RealC, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<1, 1, 1>, + complex, layout::ColumnMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassSimt, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<1, 1, 1>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of access + static int const kAccessSizeInBits = sizeof_bits::value; + + /// No vectorized accesses + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// Policy of iterator B + 
using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator B + using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape<0, 0>, + MatrixShape<0, Shape::kK / 32>, + WarpCount::kK>; +}; + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + typename RealA, + typename RealB, + typename RealC, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<1, 1, 1>, + complex, layout::ColumnMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassSimt, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, 
CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<1, 1, 1>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of access + static int const kAccessSizeInBits = sizeof_bits::value; + + /// No vectorized accesses + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape<0, 0>, + MatrixShape<0, 0>, // or Shape::kK / 32 + WarpCount::kK>; +}; + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + typename RealA, + typename RealB, + typename RealC, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<1, 1, 1>, + complex, layout::RowMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassSimt, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<1, 1, 1>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + 
"This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of access + static int const kAccessSizeInBits = sizeof_bits::value; + + /// No vectorized accesses + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator B + using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape, + MatrixShape<0, Shape::kK / 32>, + WarpCount::kK>; +}; + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + typename RealA, + typename RealB, + typename RealC, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<1, 1, 1>, + complex, layout::RowMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassSimt, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<1, 1, 1>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This 
specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of access + static int const kAccessSizeInBits = sizeof_bits::value; + + /// No vectorized accesses + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape, + MatrixShape<0, 0>, // or Shape::kK / 32 + WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace gemm diff --git a/include/cutlass/gemm/threadblock/mma_multistage.h b/include/cutlass/gemm/threadblock/mma_multistage.h index 0431c3060f..804e3373a3 100644 --- a/include/cutlass/gemm/threadblock/mma_multistage.h +++ b/include/cutlass/gemm/threadblock/mma_multistage.h @@ -228,7 +228,7 @@ class MmaMultistage : for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { auto gmem_ptr = iterator_A.get(); - cutlass::arch::cp_async( + cutlass::arch::cp_async_zfill( dst_ptr + v, gmem_ptr, iterator_A.valid()); ++iterator_A; @@ -258,7 +258,7 @@ class MmaMultistage : for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { auto gmem_ptr = iterator_B.get(); - cutlass::arch::cp_async( + cutlass::arch::cp_async_zfill( dst_ptr + v, gmem_ptr, iterator_B.valid()); ++iterator_B; @@ -513,6 +513,11 @@ class MmaMultistage : } } + + // commit and drain all pending and predicated LDGSTS pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); } }; diff --git a/include/cutlass/gemm/threadblock/mma_singlestage.h b/include/cutlass/gemm/threadblock/mma_singlestage.h index 32d4d4ee60..373d985ac6 100644 --- a/include/cutlass/gemm/threadblock/mma_singlestage.h +++ b/include/cutlass/gemm/threadblock/mma_singlestage.h @@ -105,6 +105,14 @@ class MmaSingleStage : public MmaBase { /// Warp-level Mma using Operator = typename Policy::Operator; + using ArchTag = arch::Sm70; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + // staticaly assert kStages for MmaSingleStage is 1 (single stage mma pipeline) static_assert((Base::kStages==1), "MmaSingleStage requires kStages set to value 1"); private: diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_complex_tensor_op.h index 2dc72fd333..a34c16df07 100644 --- a/include/cutlass/gemm/warp/mma_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op.h @@ -314,8 +314,17 @@ class MmaComplexTensorOp< /// Shape 
of the warp in units of thread (concept: MmaLanePolicyTensorOp) using Policy = Policy_; + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + + /// Architecture tag from underlying instruction + using ArchTag = typename ArchMmaOperator::ArchTag; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + /// Shape of underlying instruction - using InstructionShape = typename Policy::Operator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; /// Complex transform on A operand static ComplexTransform const kTransformA = TransformA; @@ -323,9 +332,6 @@ class MmaComplexTensorOp< /// Complex transform on B operand static ComplexTransform const kTransformB = TransformB; - /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; - /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -337,7 +343,7 @@ class MmaComplexTensorOp< Operand::kA, ElementA, LayoutA, - MatrixShape, + MatrixShape, Policy::OpDelta::kRow, 32, 1 @@ -355,7 +361,7 @@ class MmaComplexTensorOp< Operand::kB, ElementB, LayoutB, - MatrixShape, + MatrixShape, Policy::OpDelta::kColumn, 32, 1 @@ -368,14 +374,14 @@ class MmaComplexTensorOp< using TransformedFragmentB = FragmentB; static_assert( - !(Shape::kM % Policy::Operator::Shape::kM) && - !(Shape::kN % Policy::Operator::Shape::kN), + !(Shape::kM % ArchMmaOperator::Shape::kM) && + !(Shape::kN % ArchMmaOperator::Shape::kN), "Shape of warp-level Mma must be divisible by operator shape."); /// Number of mma operations performed using MmaIterations = MatrixShape< - Shape::kM / Policy::Operator::Shape::kM, - Shape::kN / Policy::Operator::Shape::kN + Shape::kM / ArchMmaOperator::Shape::kM, + Shape::kN / ArchMmaOperator::Shape::kN >; /// Iterates over the C operand in memory @@ -383,7 +389,7 @@ class MmaComplexTensorOp< MatrixShape, ElementC, LayoutC, - typename Policy::Operator::Shape, + typename ArchMmaOperator::Shape, typename Policy::OpDelta>; /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this @@ -393,7 +399,7 @@ class MmaComplexTensorOp< using FragmentC = typename IteratorC::Fragment; static_assert( - FragmentC::kElements == 2 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements, + FragmentC::kElements == 2 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements, "Unexpected planar complex fragment length."); private: @@ -403,7 +409,7 @@ class MmaComplexTensorOp< // /// Underlying real-valued matrix multiply operator (concept: arch::Mma) - typename Policy::Operator mma; + ArchMmaOperator mma; public: @@ -425,9 +431,9 @@ class MmaComplexTensorOp< ) const { // Alias types for underlying real-valued matrix multiply operator - using MmaOperandA = typename Policy::Operator::FragmentA; - using MmaOperandB = typename Policy::Operator::FragmentB; - using MmaOperandC = typename Policy::Operator::FragmentC; + using MmaOperandA = typename ArchMmaOperator::FragmentA; + using MmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; static_assert(MmaOperandA::kElements == 1, "This implementation only supports math instructions in which exactly one element is needed for the A operand." 
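For context on the mma_multistage.h changes above: cp_async_zfill issues the same cp.async (LDGSTS) as cp_async but writes zeros into shared memory when the predicate is off, and the new fence/wait/syncthreads sequence drains any copies still in flight when the mainloop exits. A rough sketch of both idioms using the primitives from cutlass/arch/memory_sm80.h follows; the wrapper function names are illustrative only.

#include "cutlass/arch/memory_sm80.h"
#include "cutlass/cutlass.h"

// Copy one access worth of data from global to shared memory. When 'guard' is false,
// shared memory still receives zeros rather than being left unwritten, so residual
// tiles never consume stale data.
template <int SizeInBytes>
CUTLASS_DEVICE
void stage_fragment(void *smem_ptr, void const *gmem_ptr, bool guard) {
  cutlass::arch::cp_async_zfill<SizeInBytes, cutlass::arch::CacheOperation::Always>(
      smem_ptr, gmem_ptr, guard);
}

// Commit all issued cp.async operations, wait for every committed group to land, then
// synchronize the threadblock before shared memory is reused.
CUTLASS_DEVICE
void drain_outstanding_copies() {
  cutlass::arch::cp_async_fence();
  cutlass::arch::cp_async_wait<0>();
  __syncthreads();
}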
@@ -599,12 +605,18 @@ class MmaComplexTensorOp< /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) using Policy = Policy_; - + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + /// Shape of underlying instruction - using InstructionShape = typename Policy::Operator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; /// Underlying arch tag - using ArchTag = typename Policy::Operator::ArchTag; + using ArchTag = typename ArchMmaOperator::ArchTag; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; /// Complex transform on A operand static ComplexTransform const kTransformA = TransformA; @@ -612,9 +624,6 @@ class MmaComplexTensorOp< /// Complex transform on B operand static ComplexTransform const kTransformB = TransformB; - /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; - /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -626,7 +635,7 @@ class MmaComplexTensorOp< Operand::kA, ElementA, LayoutA, - MatrixShape, + MatrixShape, Policy::OpDelta::kRow, 32, 1 @@ -637,7 +646,7 @@ class MmaComplexTensorOp< /// Storage for transformed A tile using TransformedFragmentA = - Array; + Array; /// Iterates over the B operand in memory using IteratorB = MmaTensorOpMultiplicandTileIterator< @@ -645,7 +654,7 @@ class MmaComplexTensorOp< Operand::kB, ElementB, LayoutB, - MatrixShape, + MatrixShape, Policy::OpDelta::kColumn, 32, 1 @@ -656,17 +665,17 @@ class MmaComplexTensorOp< /// Storage for transformed B tile using TransformedFragmentB = - Array; + Array; static_assert( - !(Shape::kM % Policy::Operator::Shape::kM) && - !(Shape::kN % Policy::Operator::Shape::kN), + !(Shape::kM % ArchMmaOperator::Shape::kM) && + !(Shape::kN % ArchMmaOperator::Shape::kN), "Shape of warp-level Mma must be divisible by operator shape."); /// Number of complex products operations performed (one complex product needs four mma instructions) using MmaIterations = MatrixShape< - Shape::kM / Policy::Operator::Shape::kM, - Shape::kN / Policy::Operator::Shape::kN + Shape::kM / ArchMmaOperator::Shape::kM, + Shape::kN / ArchMmaOperator::Shape::kN >; /// Iterates over the C operand in memory @@ -674,7 +683,7 @@ class MmaComplexTensorOp< MatrixShape, ElementC, LayoutC, - typename Policy::Operator::Shape, + typename ArchMmaOperator::Shape, typename Policy::OpDelta>; /// Storage for C tile, the accumulator. 
Note, regardless of multiplicand type, this @@ -690,7 +699,7 @@ class MmaComplexTensorOp< // /// Underlying real-valued matrix multiply operator (concept: arch::Mma) - typename Policy::Operator mma; + ArchMmaOperator mma; public: @@ -712,11 +721,11 @@ class MmaComplexTensorOp< ) const { // Alias types for underlying real-valued matrix multiply operator - using InstMmaOperandA = typename Policy::Operator::FragmentA; - using InstMmaOperandB = typename Policy::Operator::FragmentB; - using MmaOperandC = typename Policy::Operator::FragmentC; + using InstMmaOperandA = typename ArchMmaOperator::FragmentA; + using InstMmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; - static_assert(platform::is_same, typename Policy::Operator::Shape>::value, + static_assert(platform::is_same, typename ArchMmaOperator::Shape>::value, "This implementation only supports MMA.1688 math instructions."); static_assert(InstMmaOperandA::kElements == 4, @@ -794,8 +803,8 @@ class MmaComplexTensorOp< void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, FragmentA const &A, FragmentB const &B) const { // Alias types for underlying real-valued matrix multiply operator - using InstMmaOperandA = typename Policy::Operator::FragmentA; - using InstMmaOperandB = typename Policy::Operator::FragmentB; + using InstMmaOperandA = typename ArchMmaOperator::FragmentA; + using InstMmaOperandB = typename ArchMmaOperator::FragmentB; // // Define conversions from source type to instruction operands' type diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h index bf3d98dfbe..4ab139023a 100644 --- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h @@ -147,11 +147,17 @@ class MmaGaussianComplexTensorOp< /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) using Policy = Policy_; + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + /// Shape of underlying instruction - using InstructionShape = typename Policy::Operator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Underlying arch tag + using ArchTag = typename ArchMmaOperator::ArchTag; - /// Underlying architecture tag - using ArchTag = typename Policy::Operator::ArchTag; + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; /// Complex transform on A operand static ComplexTransform const kTransformA = TransformA; @@ -159,8 +165,6 @@ class MmaGaussianComplexTensorOp< /// Complex transform on B operand static ComplexTransform const kTransformB = TransformB; - /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -173,7 +177,7 @@ class MmaGaussianComplexTensorOp< Operand::kA, ElementA, LayoutA, - MatrixShape, + MatrixShape, Policy::OpDelta::kRow, 32, 1 @@ -191,7 +195,7 @@ class MmaGaussianComplexTensorOp< Operand::kB, ElementB, LayoutB, - MatrixShape, + MatrixShape, Policy::OpDelta::kColumn, 32, 1 @@ -204,14 +208,14 @@ class MmaGaussianComplexTensorOp< using TransformedFragmentB = FragmentB; static_assert( - !(Shape::kM % Policy::Operator::Shape::kM) && - !(Shape::kN % Policy::Operator::Shape::kN), + !(Shape::kM % ArchMmaOperator::Shape::kM) && + !(Shape::kN % ArchMmaOperator::Shape::kN), 
"Shape of warp-level Mma must be divisible by operator shape."); /// Number of mma operations performed using MmaIterations = MatrixShape< - Shape::kM / Policy::Operator::Shape::kM, - Shape::kN / Policy::Operator::Shape::kN + Shape::kM / ArchMmaOperator::Shape::kM, + Shape::kN / ArchMmaOperator::Shape::kN >; /// Iterates over the C operand in memory @@ -219,7 +223,7 @@ class MmaGaussianComplexTensorOp< MatrixShape, ElementC, LayoutC, - typename Policy::Operator::Shape, + typename ArchMmaOperator::Shape, typename Policy::OpDelta>; /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this @@ -229,7 +233,7 @@ class MmaGaussianComplexTensorOp< using FragmentC = typename IteratorC::Fragment; static_assert( - FragmentC::kElements == 3 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements, + FragmentC::kElements == 3 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements, "Unexpected gaussian complex fragment length."); private: @@ -239,7 +243,7 @@ class MmaGaussianComplexTensorOp< // /// Underlying real-valued matrix multiply operator (concept: arch::Mma) - typename Policy::Operator mma; + ArchMmaOperator mma; public: @@ -261,9 +265,9 @@ class MmaGaussianComplexTensorOp< ) const { // Alias types for underlying real-valued matrix multiply operator - using MmaOperandA = typename Policy::Operator::FragmentA; - using MmaOperandB = typename Policy::Operator::FragmentB; - using MmaOperandC = typename Policy::Operator::FragmentC; + using MmaOperandA = typename ArchMmaOperator::FragmentA; + using MmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; static_assert(MmaOperandA::kElements == 1, "This implementation only supports math instructions in which exactly one element is needed for the A operand." 
@@ -346,8 +350,6 @@ class MmaGaussianComplexTensorOp< ///////////////////////////////////////////////////////////////////////////////////////////////// -// TODO - partial specializations of real*complex and complex*real - ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace warp diff --git a/include/cutlass/gemm/warp/mma_simt.h b/include/cutlass/gemm/warp/mma_simt.h index c90624cee7..306a08d17c 100644 --- a/include/cutlass/gemm/warp/mma_simt.h +++ b/include/cutlass/gemm/warp/mma_simt.h @@ -68,6 +68,10 @@ template < typename Policy_, /// Number of partitions along K dimension int PartitionsK = 1, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, /// Used for partial specialization typename Enable = bool > @@ -104,10 +108,10 @@ class MmaSimt { using ArchTag = arch::Sm50; /// Complex transform on A operand - static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformA = TransformA; /// Complex transform on B operand - static ComplexTransform const kTransformB = ComplexTransform::kNone; + static ComplexTransform const kTransformB = TransformB; /// Layout of threads using ThreadLayoutA = typename platform::conditional< platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA >::value, @@ -215,12 +219,22 @@ class MmaSimt { CUTLASS_DEVICE void operator()( FragmentC &d, - FragmentA const &a, - FragmentB const &b, + FragmentA a, + FragmentB b, FragmentC const &c, int group_idx = 0) const { ThreadMma mma; + if (kTransformA == ComplexTransform::kConjugate) { + conjugate conj_a; + a = conj_a(a); + } + + if (kTransformB == ComplexTransform::kConjugate) { + conjugate conj_b; + b = conj_b(b); + } + mma(d, a, b, c); } diff --git a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h index 8b7312baa0..ba86e08583 100644 --- a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h @@ -111,17 +111,28 @@ class SparseMmaTensorOp { /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) using Policy = Policy_; + /// Equivalant base dense mma + using Base = MmaTensorOp; + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Base::ArchMmaOperator; + /// Architecture tag from underlying instruction - using ArchTag = typename Policy::Operator::ArchTag; + using ArchTag = typename Base::ArchTag; /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; + using OperatorClass = typename Base::OperatorClass; + + /// Shape of underlying instruction + using InstructionShape = typename Base::InstructionShape; /// Complex transform on A operand - static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformA = Base::kTransformA; /// Complex transform on B operand - static ComplexTransform const kTransformB = ComplexTransform::kNone; + static ComplexTransform const kTransformB = Base::kTransformB; /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -171,25 +182,19 @@ class SparseMmaTensorOp { Array; /// Iterates over the B operand in memory - using IteratorB = MmaTensorOpMultiplicandTileIterator< - MatrixShape, Operand::kB, ElementB, LayoutB, - MatrixShape, - Policy::OpDelta::kRow, 
kThreadCount, kPartitionsK>; + using IteratorB = typename Base::IteratorB; /// Storage for B tile - using FragmentB = typename IteratorB::Fragment; + using FragmentB = typename Base::FragmentB; /// Storage for transformed B tile - using TransformedFragmentB = - Array; + using TransformedFragmentB = typename Base::TransformedFragmentB; /// Iterates over the C operand in memory - using IteratorC = MmaTensorOpAccumulatorTileIterator< - MatrixShape, ElementC, LayoutC, - typename Policy::Operator::Shape, typename Policy::OpDelta>; + using IteratorC = typename Base::IteratorC; /// Storage for C tile - using FragmentC = typename IteratorC::Fragment; + using FragmentC = typename Base::FragmentC; /// Iterates over the E operand in memory using IteratorE = SparseMmaTensorOpMetaTileIterator< @@ -204,23 +209,13 @@ class SparseMmaTensorOp { /// Storage for E tile using FragmentE = typename IteratorE::Fragment; -private: - - static_assert( - !(Shape::kM % Policy::Operator::Shape::kM) && - !(Shape::kN % Policy::Operator::Shape::kN), - "Shape of warp-level Mma must be divisible by operator shape."); - - /// Number of mma operations performed - using MmaIterations = MatrixShape< - Shape::kM / Policy::Operator::Shape::kM, - Shape::kN / Policy::Operator::Shape::kN - >; + /// Number of mma operations performed + using MmaIterations = typename Base::MmaIterations; public: /// Underlying matrix multiply operator (concept: arch::Mma) - typename Policy::Operator mma; + ArchMmaOperator mma; public: @@ -299,21 +294,21 @@ class SparseMmaTensorOp { // Define conversions from source type to instruction type // FloatRoundStyle const kRoundA = - PreferredRoundingMode::kRound; FloatRoundStyle const kRoundB = - PreferredRoundingMode::kRound; - detail::ConvertAndPack convert_A; - NumericArrayConverter convert_B; Array const *ptr_A = reinterpret_cast const *>(&A); - Array * - ptr_dst_A = reinterpret_cast * + ptr_dst_A = reinterpret_cast *>(&dst_A); dst_B = convert_B(B); diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h index 1a10c7e4fe..a60a86020a 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_tensor_op.h @@ -244,8 +244,6 @@ class MmaTensorOp { /// Storage for C tile using FragmentC = typename IteratorC::Fragment; -private: - /// Number of mma operations performed using MmaIterations = MatrixShape< (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM, diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h index 1fe04e92af..59f68a42a1 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h @@ -1518,6 +1518,7 @@ class MmaTensorOpMultiplicandTileIterator< } else if (Layout::kFactor == 2) { // Super Matrix multiply kBlock = 32 if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) { + // Matrix multiply 1688 A/B // (Q stands for 1 8x128bit block). 
// Q0 // Q1 @@ -3191,10 +3192,430 @@ class MmaTensorOpAccumulatorTileIterator< int idx = mma_m + mma_n * Policy::MmaIterations::kRow; - AccessType* access_ptr = reinterpret_cast(offset_ref.data() + - offset_ref.offset(TensorCoord(accum_m, accum_n))); + AccessType* access_ptr = reinterpret_cast(offset_ref.data() + + offset_ref.offset(TensorCoord(accum_m, accum_n))); + + access_ptr[0] = frag_ptr[idx]; + } + } + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_byte_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index byte_offset) const { ///< store a tile with a linear offset + + store_with_pointer_offset(byte_offset / sizeof(Element)); + } + + /// Stores a fragment to memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + Fragment &frag, ///< fragment to store to the tensor + TensorCoord const &tile_offset) const { ///< stores a tile with a logical offset in units of whole tiles + + store(frag, tile_offset, 0); + } + + /// Stores a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + /// fragment to store to the tensor + Fragment const &frag, + /// stores a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// stores a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store +/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major +/// accumulator layout. 
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept | +/// WriteableRandomAccessContiguousTileIteratorConcept +/// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Element typ + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_, + /// Interleaved N + int InterleavedN> +class MmaTensorOpAccumulatorTileIterator< + Shape_, Element_, cutlass::layout::TensorNCxHWx, + InstructionShape_, OpDelta_> { + public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand::kC; + + /// Element type + using Element = int8_t; + + /// Layout of source tile + using Layout = cutlass::layout::TensorNCxHWx; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + using OpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + /// Number of elements in strided dimension that each STG writes + static int const kStridedPerSTG = 8; + + /// Factor to calculate reorder index to pack accumulator. 
+ static int const kPackedFactor = Shape::kColumn / 32; + + /// Number of mma operations performed + using MmaIterations = MatrixShape; + }; + +private: + + static int const kElementsPerAccess = InterleavedN / 4; + +public: + + // + // Derived quantities + // + + struct alignas((kElementsPerAccess * sizeof_bits::value / 8)) AccessType { + Array storage; + }; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + +private: + + /// Reference to output tensor + TensorRef ref_; + + /// Row offset index globally + LongIndex global_offset_row_; + + /// Column offset index globally + LongIndex global_offset_col_; + + /// Output tensor size + TensorCoord extent_; + + /// Alpha + float alpha_; + + /// Beta + float beta_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator( + TensorRef const &ref, + int const lane_id, + TensorCoord extent, + float alpha = 1.0f, + float beta = 0.0f + ): + ref_(ref), + extent_(extent), + alpha_(alpha), + beta_(beta) { + + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + + global_offset_row_ = quad; + + global_offset_col_ = lane_in_quad * kElementsPerAccess; + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator &add_tile_offset(MatrixCoord const &tile_offset) { + + global_offset_row_ += tile_offset.row() * Shape::kRow; + + global_offset_col_ += tile_offset.column() * Shape::kColumn; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator & operator++() { + // deliberate no-op + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator & operator--() { + // deliberate no-op + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + Fragment &frag, ///< fragment to load from the tensor + Index pointer_offset) const { ///< loads a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kN; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kM; ++mma_m) { + int accum_m = mma_m * InstructionShape::kM; + int accum_n = mma_n * InstructionShape::kN; + + int idx = mma_m + mma_n * Policy::MmaIterations::kM; + + AccessType* access_ptr = reinterpret_cast(offset_ref.data() + + accum_m * offset_ref.stride(0) + accum_n); + + frag_ptr[idx] = access_ptr[0]; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + Fragment &frag, ///< fragment to load from the tensor + Index byte_offset) const { ///< loads a tile with a linear offset + + load_with_pointer_offset(byte_offset / sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset) const { ///< loads a tile with a logical offset in units of whole tiles + + load(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset, ///< loads a tile with a logical offset in units of whole tiles + Index pointer_offset) const { ///< loads a tile with a logical offset AND a pointer offset + + load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } - access_ptr[0] = frag_ptr[idx]; + /// Stores a fragment to memory + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) const { + store_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_pointer_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index pointer_offset) const { ///< store a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + Array output_frag_f; + Array output_frag; + + LongIndex pq = extent_.h() * extent_.w(); + + LongIndex extent_row = extent_.n() * pq; + LongIndex extent_col = extent_.c(); + + LongIndex k_major = (global_offset_col_ / InterleavedN) * pq; + Index k_minor = global_offset_col_ % InterleavedN; + LongIndex k_offset = k_major * InterleavedN + k_minor; + LongIndex k_offset_delta = pq * InterleavedN; + + LongIndex stride_n = pq * extent_.c(); + + Index n; + LongIndex pq_rem; + + unsigned int pq_mul, pq_shr; + find_divisor(pq_mul, pq_shr, pq); + + if(beta_ == 0.0f) { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + output_frag_f[i] = frag[i]; + } + + if(InstructionShape::kM == Policy::kStridedPerSTG) { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + output_frag[i] = (Element)(output_frag_f[i] * alpha_); + } + } else { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + int map_i = (i / (16 * Policy::kPackedFactor)) * (16 * Policy::kPackedFactor) + + (i % (8 * 
Policy::kPackedFactor)) / 2 * 4 + + (i % (8 * Policy::kPackedFactor)) % 2 + + (i / (8 * Policy::kPackedFactor)) % 2 * 2; + output_frag[i] = (Element)(output_frag_f[map_i] * alpha_); + } + } + + AccessType const *frag_ptr = reinterpret_cast(&output_frag); + + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + int accum_m = mma_m * Policy::kStridedPerSTG; + + fast_divmod(n, pq_rem, global_offset_row_ + accum_m, pq, pq_mul, pq_shr); + LongIndex offset_m = n * stride_n + k_offset + pq_rem * InterleavedN; + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + + int accum_n = mma_n * InterleavedN; + + int idx = mma_n + mma_m * Policy::MmaIterations::kColumn; + + if((global_offset_row_ + accum_m < extent_row) && (global_offset_col_ + accum_n < extent_col)) { + AccessType* access_ptr = reinterpret_cast(offset_ref.data() + + offset_m + mma_n * k_offset_delta); + + access_ptr[0] = frag_ptr[idx]; + } + } + } + } else { + if(InstructionShape::kM == Policy::kStridedPerSTG) { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + output_frag_f[i] = frag[i]; + } + } else { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + int map_i = (i / (16 * Policy::kPackedFactor)) * (16 * Policy::kPackedFactor) + + (i % (8 * Policy::kPackedFactor)) / 2 * 4 + + (i % (8 * Policy::kPackedFactor)) % 2 + + (i / (8 * Policy::kPackedFactor)) % 2 * 2; + output_frag_f[i] = frag[map_i]; + } + } + + AccessType const *frag_ptr = reinterpret_cast(&output_frag); + + Array ref_frag; + AccessType *ref_frag_ptr = reinterpret_cast(&ref_frag); + + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + int accum_m = mma_m * Policy::kStridedPerSTG; + + fast_divmod(n, pq_rem, global_offset_row_ + accum_m, pq, pq_mul, pq_shr); + LongIndex offset_m = n * stride_n + k_offset + pq_rem * InterleavedN; + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + + int accum_n = mma_n * InterleavedN; + + int idx = mma_n + mma_m * Policy::MmaIterations::kColumn; + + if((global_offset_row_ + accum_m < extent_row) && (global_offset_col_ + accum_n < extent_col)) { + AccessType* access_ptr = reinterpret_cast(offset_ref.data() + + offset_m + mma_n * k_offset_delta); + + ref_frag_ptr[0] = access_ptr[0]; + + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < kElementsPerAccess; ++i) { + output_frag[idx * kElementsPerAccess + i] = Element(alpha_ * output_frag_f[idx * kElementsPerAccess + i] + + beta_ * ref_frag[i]); + } + + access_ptr[0] = frag_ptr[idx]; + } + } } } } diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h index ed6384f05a..c57cc6a8d9 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h @@ -2243,6 +2243,847 @@ class MmaVoltaTensorOpMultiplicandTileIterator< } }; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for 'TN' arrangement +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Operand identity + Operand Operand_, + /// Data type of A elements + typename Element_, + /// Layout of matrix operand + typename Layout_, + /// Shape of one matrix production operation (concept: MatrixShape) + typename InstructionShape_, + /// Delta between *MMA operations (in units of 
*MMA operations, concept: + /// MatrixShape) + int OpDelta_, + /// Number of threads participating in one matrix operation + int Threads = 32, + /// Number of partitions along K dimension + int PartitionsK_ = 1> +class MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner { + public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + /// Basic check + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = Layout_; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Number of elements accessed per Shared Memory load + static int const kElementsPerAccess = 4; + +private: + + static int const kInterleavedTileRows = 32; + static int const kInterleavedTileColumns = 32; + static int const kInstructionsPerTile = 2; + + /// Rounded up instruction counts + using TileCount = MatrixShape< + Shape::kRow / kInterleavedTileRows, + Shape::kColumn / kInterleavedTileColumns + >; + + using FragmentCount = MatrixShape< + TileCount::kRow * kInstructionsPerTile, + TileCount::kColumn * kInstructionsPerTile + >; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = Array< + Element, + (kOperand == Operand::kA ? 
FragmentCount::kRow : FragmentCount::kColumn) * kElementsPerAccess + >; + + /// Memory access type + using AccessType = AlignedArray; + +private: + + /// Underlying tensor reference + TensorRef ref_; + + /// Extent of tensor + MatrixCoord extent_; + + /// Origin + MatrixCoord origin_; + + /// Used to conditionally enable extents checking + bool divisible_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner(): divisible_(true) { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner( + TensorRef const &ref, + int lane_id + ): + ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) { + + int quad_id = lane_id / 4; + int lane_in_quad = (lane_id % 4); + + if (kOperand == Operand::kA) { + + int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile + lane_in_quad; + int col_idx = 0; + + origin_ = MatrixCoord(row_idx, col_idx); + } + else { + + int row_idx = 0; + int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile + lane_in_quad; + + origin_ = MatrixCoord(row_idx, col_idx); + } + + ref_.add_coord_offset(origin_); + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner( + TensorRef const &ref, + TensorCoord extent, + int lane_id + ): ref_(ref), extent_(extent), divisible_(false) { + + int quad_id = lane_id / 4; + int lane_in_quad = (lane_id % 4); + + if (kOperand == Operand::kA) { + + int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile + lane_in_quad; + int col_idx = 0; + + origin_ = MatrixCoord(row_idx, col_idx); + } + else { + + int row_idx = 0; + int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile + lane_in_quad; + + origin_ = MatrixCoord(row_idx, col_idx); + } + + #if defined(__CUDA_ARCH__) + __syncthreads(); + #endif + + ref_.add_coord_offset(origin_); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner &add_pointer_offset(LongIndex offset) { + + ref_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner &add_tile_offset(TensorCoord const &tile_offset) { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + origin_ += coord_offset; + + ref_.add_coord_offset(coord_offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator++() { + + if (kOperand == Operand::kA) { + add_tile_offset({0, 1}); + } + else { + add_tile_offset({1, 0}); + } + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator--() { + + if (kOperand == Operand::kA) { + add_tile_offset({0, -1}); + } + else { + add_tile_offset({-1, 0}); + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + 
CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_pointer_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + AccessType *frag_ptr = reinterpret_cast(&frag); + AccessType const *access_ptr = reinterpret_cast(ref_.data()); + int ldm = ref_.stride()[0]; + + if (kOperand == Operand::kA) { + + CUTLASS_PRAGMA_UNROLL + for (int idx = 0; idx < FragmentCount::kRow; ++idx) { + + int tile_idx = idx / 2; + int quad_idx = idx % 2; + + int row_offset = tile_idx * kInterleavedTileRows + quad_idx * 4; + frag_ptr[idx] = access_ptr[row_offset * ldm / kElementsPerAccess]; + } + } + else { + CUTLASS_PRAGMA_UNROLL + for (int idx = 0; idx < FragmentCount::kColumn; ++idx) { + + int tile_idx = idx / 2; + int quad_idx = idx % 2; + + int col_offset = tile_idx * kInterleavedTileColumns + quad_idx * 4; + frag_ptr[idx] = access_ptr[col_offset * ldm / kElementsPerAccess]; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + + load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits::value); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits::value); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. 
+ /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation + } +}; + + +/// Tile iterator specialized for 'NT' arrangement +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Operand identity + Operand Operand_, + /// Data type of A elements + typename Element_, + /// Layout of matrix operand + typename Layout_, + /// Shape of one matrix production operation (concept: MatrixShape) + typename InstructionShape_, + /// Delta between *MMA operations (in units of *MMA operations, concept: + /// MatrixShape) + int OpDelta_, + /// Number of threads participating in one matrix operation + int Threads = 32, + /// Number of partitions along K dimension + int PartitionsK_ = 1> +class MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter { + public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + /// Basic check + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = Layout_; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Number of elements accessed per Shared Memory load + static int const kElementsPerAccess = 4; + +private: + + static int const kInterleavedTileRows = 32; + static int const kInterleavedTileColumns = 32; + static int const kInstructionsPerTile = 2; + + /// Rounded up instruction counts + using TileCount = MatrixShape< + Shape::kRow / kInterleavedTileRows, + Shape::kColumn / kInterleavedTileColumns + >; + + using FragmentCount = MatrixShape< + TileCount::kRow * kInstructionsPerTile, + TileCount::kColumn * kInstructionsPerTile + >; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = Array< + Element, + (kOperand == Operand::kA ? 
FragmentCount::kRow : FragmentCount::kColumn) * kElementsPerAccess + >; + + /// Memory access type + using AccessType = AlignedArray; + +private: + + /// Underlying tensor reference + TensorRef ref_; + + /// Extent of tensor + MatrixCoord extent_; + + /// Origin + MatrixCoord origin_; + + /// Used to conditionally enable extents checking + bool divisible_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter(): divisible_(true) { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter( + TensorRef const &ref, + int lane_id + ): + ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) { + + int quad_id = lane_id / 4; + int lane_in_quad = (lane_id % 4); + + if (kOperand == Operand::kA) { + + int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile; + int col_idx = lane_in_quad; + + origin_ = MatrixCoord(row_idx, col_idx); + } + else { + + int row_idx = lane_in_quad; + int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile; + + origin_ = MatrixCoord(row_idx, col_idx); + } + + ref_.add_coord_offset(origin_); + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter( + TensorRef const &ref, + TensorCoord extent, + int lane_id + ): ref_(ref), extent_(extent), divisible_(false) { + + int quad_id = lane_id / 4; + int lane_in_quad = (lane_id % 4); + + if (kOperand == Operand::kA) { + + int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile; + int col_idx = lane_in_quad; + + origin_ = MatrixCoord(row_idx, col_idx); + } + else { + + int row_idx = lane_in_quad; + int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile; + + origin_ = MatrixCoord(row_idx, col_idx); + } + + #if defined(__CUDA_ARCH__) + __syncthreads(); + #endif + + ref_.add_coord_offset(origin_); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter &add_pointer_offset(LongIndex offset) { + + ref_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter &add_tile_offset(TensorCoord const &tile_offset) { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + origin_ += coord_offset; + + ref_.add_coord_offset(coord_offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator++() { + + if (kOperand == Operand::kA) { + add_tile_offset({0, 1}); + } + else { + add_tile_offset({1, 0}); + } + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator--() { + + if (kOperand == Operand::kA) { + add_tile_offset({0, -1}); + } + else { + add_tile_offset({-1, 0}); + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + 
MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_pointer_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + AccessType *frag_ptr = reinterpret_cast(&frag); + AccessType const *access_ptr = reinterpret_cast(ref_.data()); + int ldm = ref_.stride()[0]; + + if (kOperand == Operand::kA) { + + CUTLASS_PRAGMA_UNROLL + for (int idx = 0; idx < FragmentCount::kRow; ++idx) { + + int tile_idx = idx / 2; + int quad_idx = idx % 2; + + int row_offset = tile_idx * kInterleavedTileRows; + frag_ptr[idx] = access_ptr[row_offset / kElementsPerAccess + quad_idx]; + } + } + else { + CUTLASS_PRAGMA_UNROLL + for (int idx = 0; idx < FragmentCount::kColumn; ++idx) { + + int tile_idx = idx / 2; + int quad_idx = idx % 2; + + int col_offset = tile_idx * kInterleavedTileColumns; + frag_ptr[idx] = access_ptr[col_offset / kElementsPerAccess + quad_idx]; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + + load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits::value); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits::value); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. 
+ /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_> +class MmaVoltaTensorOpMultiplicandTileIterator< + Shape_, + Operand::kA, + Element_, + cutlass::layout::RowMajor, + InstructionShape_, + OpDelta_, + 32 +> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner< + Shape_, Operand::kA, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> { + +public: + using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner< + Shape_, Operand::kA, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> ; + + using TensorRef = typename Base::TensorRef; + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): Base(ref, lane_id) { } + +}; + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_> +class MmaVoltaTensorOpMultiplicandTileIterator< + Shape_, + Operand::kA, + Element_, + cutlass::layout::ColumnMajor, + InstructionShape_, + OpDelta_, + 32 +> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter< + Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> { + +public: + using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter< + Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> ; + + using TensorRef = typename Base::TensorRef; + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): Base(ref, lane_id) { } + +}; + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_> +class MmaVoltaTensorOpMultiplicandTileIterator< + Shape_, Operand::kB, Element_, + cutlass::layout::ColumnMajor, + InstructionShape_, OpDelta_, 32 +> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner< + Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> { + +public: + using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner< + Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_>; + + using TensorRef = typename Base::TensorRef; + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): Base(ref, lane_id) { } +}; + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of elements + typename Element_, + /// Shape of one matrix 
product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_> +class MmaVoltaTensorOpMultiplicandTileIterator< + Shape_, Operand::kB, Element_, + cutlass::layout::RowMajor, + InstructionShape_, OpDelta_, 32 +> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter< + Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> { + +public: + using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter< + Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_>; + + using TensorRef = typename Base::TensorRef; + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): Base(ref, lane_id) { } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace warp } // namespace gemm } // namespace cutlass diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h index f3d5a12bf8..7f608dcf76 100644 --- a/include/cutlass/layout/tensor.h +++ b/include/cutlass/layout/tensor.h @@ -40,6 +40,7 @@ #endif #include "cutlass/cutlass.h" #include "cutlass/fast_math.h" +#include "cutlass/layout/pitch_linear.h" #include "cutlass/layout/matrix.h" #include "cutlass/coord.h" #include "cutlass/tensor_coord.h" @@ -120,6 +121,12 @@ class TensorNHWC { LongIndex(stride_[1] * coord.h()) + LongIndex(stride_[2] * coord.n()); } + + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord coord) const { + return coord.contiguous() + LongIndex(coord.strided() * stride_[2]); + } /// Returns the logical coordinate (n, h, w, c) from a given offset in linear memory. CUTLASS_HOST_DEVICE @@ -182,7 +189,6 @@ class TensorNHWC { } }; - ///////////////////////////////////////////////////////////////////////////////////////////////// /// Mapping function for 4-D NCHW tensors. @@ -424,6 +430,14 @@ class TensorCxRSKx { LongIndex(stride_[2] * c_major); } + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord const &coord) const { + return (coord.contiguous() % kInterleave) + + LongIndex((coord.contiguous() / kInterleave) * stride_[2]) + + LongIndex(coord.strided() * kInterleave); + } + /// Returns the stride of the layout CUTLASS_HOST_DEVICE Stride stride() const { diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h index de21ede4ea..c19f79cbbc 100644 --- a/include/cutlass/transform/pitch_linear_thread_map.h +++ b/include/cutlass/transform/pitch_linear_thread_map.h @@ -340,6 +340,134 @@ struct PitchLinearWarpRakedThreadMap { //////////////////////////////////////////////////////////////////////////////// +/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous +/// elements. Warps are arranged based on a stride. +/// +/// This ThreadMap is used by tensor core kernels for NCxHWx layout. 
+template < + typename Shape_, + int Threads, + typename WarpThreadArrangement_, + int ElementsPerAccess = 1 +> +struct PitchLinearStridedWarpRakedThreadMap { + + /// Tensor coordinate + using TensorCoord = layout::PitchLinearCoord; + + /// Tile shape + using Shape = Shape_; + + /// Number of threads total + static int const kThreads = Threads; + + using WarpThreadArrangement = WarpThreadArrangement_; + + /// Extract vector length from Layout + static int const kElementsPerAccess = ElementsPerAccess; + + /// Base ThreadMap + using BaseThreadMap = PitchLinearWarpRakedThreadMap< + Shape, + kThreads, + WarpThreadArrangement, + kElementsPerAccess + >; + + /// Shape of access by each thread + using ThreadAccessShape = typename BaseThreadMap::ThreadAccessShape; + + + struct Detail { + + using WarpThreadArrangement = WarpThreadArrangement_; + + using WarpAccessIterations = typename BaseThreadMap::Detail::WarpAccessIterations; + + static int const kWarpSize = BaseThreadMap::Detail::kWarpSize; + + static int const kWarpCount = BaseThreadMap::Detail::kWarpCount; + + using ShapeInAccesses = typename BaseThreadMap::Detail::ShapeInAccesses; + + // Divide it into the number of warps, first partitioning the contiguous dimension then the + // stride. + static int const kWarpsContiguous = + (WarpAccessIterations::kContiguous >= kWarpCount + ? kWarpCount + : WarpAccessIterations::kContiguous); + + static int const kWarpsStrided = + (kWarpCount > WarpAccessIterations::kContiguous + ? kWarpCount / kWarpsContiguous + : 1); + + /// Arrangement of warps within a threadblock-scoped tile + using WarpArrangement = layout::PitchLinearShape< + kWarpsContiguous, kWarpsStrided + >; + + }; + + ///< Iterations along each dimension (concept: PitchLinearShape) + using Iterations = layout::PitchLinearShape< + Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous, + Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided + >; + + static_assert(Iterations::kCount, + "Number of iterations must be non-zero"); + + ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape) + using Delta = typename BaseThreadMap::Delta; + + /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space + CUTLASS_HOST_DEVICE + static TensorCoord initial_offset(int thread_id) { + + int warp_id = (thread_id / Detail::kWarpSize); + int lane_id = (thread_id % Detail::kWarpSize); + + // + // compute warp-level offset + // + + // This is the shape of the entire area covered by a warp's memory access (in units of vectors) + layout::PitchLinearCoord warp_footprint{ + Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous, + Detail::WarpThreadArrangement::kStrided * Iterations::kStrided + }; + + // This is the offset of a specific warp (in units of vectors) + layout::PitchLinearCoord warp_offset{ + (warp_id % Detail::kWarpsContiguous), + (warp_id / Detail::kWarpsContiguous) + }; + + // This is the offset of a specific thread within a warp (units of vectors) + layout::PitchLinearCoord thread_offset_in_warp{ + lane_id % Detail::WarpThreadArrangement::kContiguous, + lane_id / Detail::WarpThreadArrangement::kContiguous + }; + + // This is the offset of a thread within a threadblock tile (units of vectors) + layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec = + warp_footprint * warp_offset + thread_offset_in_warp; + + // This is the offset of a thread within a threadblock tile (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{ + 
thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess, + thread_offset_in_threadblock_tile_vec.strided() + }; + + return thread_offset_in_threadblock_tile_base; + } + + +}; + +//////////////////////////////////////////////////////////////////////////////// + /// Transpose the existing ThreadMap. For example, interleaved layout is like /// congruous in the global memory and crosswise in the shared memory. We need /// to transpose the coordinates between two. diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h index 7e34b546be..7dce3228ec 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h @@ -500,7 +500,7 @@ class PredicatedTileAccessIterator -class RegularTileAccessIterator< - Shape_, Element_, - layout::TensorOpMultiplicandRowMajorInterleaved::value, - InterleavedK>, - AdvanceRank, ThreadMap_, Alignment> { - public: - static_assert( - AdvanceRank == 0 || AdvanceRank == 1, - "Specialization for pitch-linear iterator may along advance along the " - "contiguous(rank=0) or strided(rank=1) dimension."); - - using Shape = Shape_; - using Element = Element_; - using Layout = - layout::TensorOpMultiplicandRowMajorInterleaved::value, - InterleavedK>; - static int const kAdvanceRank = AdvanceRank; - static int const kAlignment = Alignment; - - using Index = typename Layout::Index; - using LongIndex = typename Layout::LongIndex; - - using TensorRef = TensorRef; - using TensorCoord = typename Layout::TensorCoord; - - using ThreadMap = ThreadMap_; - - /// Internal details made public to facilitate introspection - struct Detail { - /// This iterator is specialized for an access size that is 128 bits in - /// length. 
- static int const kAccessSizeInBits = 128; - - static_assert(sizeof_bits::value * ThreadMap::kElementsPerAccess == - kAccessSizeInBits, - "This iterator requires a policy whose access size is 128bs"); - }; - - private: - - /// Element type per access - using AccessType = Array; - - private: - // - // Data members - // - - /// Internal pointer to first access of tile - AccessType *pointer_; - - /// Internal byte offset - Index byte_offset_; - - /// Iteration in the contiguous dimension - int iteration_contiguous_; - - /// Iteration in the strided dimension - int iteration_strided_; - - public: - /// Construct a TileIterator with zero threadblock offset - CUTLASS_HOST_DEVICE - RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor - int thread_id ///< ID of each participating thread - ) - : byte_offset_(0) { - layout::PitchLinearCoord thread_offset_base = - ThreadMap::initial_offset(thread_id); - - // initialize pointer - pointer_ = reinterpret_cast( - ref.data() + ref.offset(thread_offset_base)); - - set_iteration_index(0); - } - - /// Overrides the internal iteration index - CUTLASS_HOST_DEVICE - void set_iteration_index(int index) { - iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; - iteration_strided_ = index / ThreadMap::Iterations::kContiguous; - } - - /// Adds a pointer offset in units of Element - CUTLASS_HOST_DEVICE - void add_pointer_offset(LongIndex pointer_offset) { - byte_offset_ += pointer_offset * sizeof(Element); - } - - /// Returns a pointer - CUTLASS_HOST_DEVICE - AccessType *get() const { - AccessType *access_ptr = pointer_; - - int access_offset = - (iteration_strided_ * ThreadMap::Delta::kStrided * Layout::kInterleavedK + - iteration_contiguous_ * ThreadMap::Delta::kContiguous) / ThreadMap::kElementsPerAccess; - - char *access_byte_ptr = - reinterpret_cast(access_ptr + access_offset); - - return reinterpret_cast(access_byte_ptr + byte_offset_); - } - - /// Advances to the next tile in memory. - CUTLASS_HOST_DEVICE - RegularTileAccessIterator &operator++() { - ++iteration_contiguous_; - - if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) - return *this; - - // Enter here only if (iteration_contiguous_ == - // ThreadMap::Iteration::kContiguous) - iteration_contiguous_ = 0; - ++iteration_strided_; - - if (iteration_strided_ < ThreadMap::Iterations::kStrided) { - return *this; - } - - // Enter here only if (iteration_strided_ == ThreadMap::Iteration::kStrided) - // which means we enter the next tile. - iteration_strided_ = 0; - - return *this; - } - - /// Advances to the next tile in memory. 
- CUTLASS_HOST_DEVICE - RegularTileAccessIterator operator++(int) { - RegularTileAccessIterator prev(*this); - this->operator++(); - - return prev; - } - - /// Adds a tile offset - CUTLASS_DEVICE - void add_tile_offset(TensorCoord const &coord) { - add_pointer_offset(coord.contiguous() * Shape::kCount); - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -/// Tile iterator specialized for k interleaved arrangements for TensorOps -/// -/// -/// Satisfies: ForwardTileIteratorConcept | -/// ReadableContiguousTileIteratorConcept | -/// WriteableContiguousTileIteratorConcept -/// - -template -class RegularTileAccessIterator< - Shape_, Element_, - layout::TensorOpMultiplicandColumnMajorInterleaved::value, - InterleavedK>, - AdvanceRank, ThreadMap_, Alignment> { - - public: - static_assert( - AdvanceRank == 0 || AdvanceRank == 1, - "Specialization for pitch-linear iterator may along advance along the " - "contiguous(rank=0) or strided(rank=1) dimension."); - - using Shape = Shape_; - using Element = Element_; - using Layout = - layout::TensorOpMultiplicandColumnMajorInterleaved::value, - InterleavedK>; - static int const kAdvanceRank = AdvanceRank; - static int const kAlignment = Alignment; - - using Index = typename Layout::Index; - using LongIndex = typename Layout::LongIndex; - - using TensorRef = TensorRef; - using TensorCoord = typename Layout::TensorCoord; - - using ThreadMap = ThreadMap_; - - /// Underlying iterator type - using UnderlyingIterator = RegularTileAccessIterator< - cutlass::MatrixShape, - Element, - layout::TensorOpMultiplicandRowMajorInterleaved::value, InterleavedK>, - (kAdvanceRank == 1 ? 0 : 1), - ThreadMap - >; - - private: - - /// Element type per access - using AccessType = Array; - - private: - - /// Underlying iterator - UnderlyingIterator iterator_; - - public: - /// Construct a TileIterator with zero threadblock offset - CUTLASS_HOST_DEVICE - RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor - int thread_id ///< ID of each participating thread - ) - : iterator_({ref.data(), ref.stride()}, thread_id) {} - - /// Overrides the internal iteration index - CUTLASS_HOST_DEVICE - void set_iteration_index(int index) { - iterator_.set_iteration_index(index); - } - - /// Adds a pointer offset in units of Element - CUTLASS_HOST_DEVICE - void add_pointer_offset(LongIndex pointer_offset) { - iterator_.add_pointer_offset(pointer_offset); - } - - /// Returns a pointer - CUTLASS_HOST_DEVICE - AccessType *get() const { - return iterator_.get(); - } - - /// Advances to the next tile in memory. - CUTLASS_HOST_DEVICE - RegularTileAccessIterator &operator++() { - ++iterator_; - return *this; - } - - /// Advances to the next tile in memory. 
- CUTLASS_HOST_DEVICE - RegularTileAccessIterator operator++(int) { - RegularTileAccessIterator prev(*this); - ++iterator_; - - return prev; - } - - /// Adds a tile offset - CUTLASS_DEVICE - void add_tile_offset(TensorCoord const &coord) { - iterator_.add_tile_offset({coord.strided(), coord.contiguous()}); - } -}; - -//////////////////////////////////////////////////////////////////////////////// - } // namespace threadblock } // namespace transform } // namespace cutlass diff --git a/media/docs/functionality.md b/media/docs/functionality.md index 3c416b3e9d..77f1ba142c 100644 --- a/media/docs/functionality.md +++ b/media/docs/functionality.md @@ -44,6 +44,28 @@ Hyperlinks to relevant unit tests demonstrate how specific template instances ma | **SpTensorOp** | 80 | 11.1+ | `s4 * s4 + s32 => {s4, s32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu) | +## Device-level Implicit GEMM convolution + +The following table summarizes device-level implicit GEMM convolution kernels in CUTLASS, organized by opcode class, data type, and layout. +Hyperlinks to relevant conv2d fprop unit tests demonstrate how specific template instances may be defined. +One can find and/or create equivalent dgrad and wgrad convolutional operators. + +|**Opcode Class** | **Compute Capability** | **CUDA Toolkit** | **Data Type** | **Layouts** | **Unit Test** | +|-----------------|------------------------|------------------|--------------------------------|------------------|------------------| +| **Simt** | 50,60,61,70,75 | 9.2+ | `f32 * f32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu) | +| **Simt** | 50,60,61,70,75 | 9.2+ | `cf32 * cf32 + cf32 => cf32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu) | +| **TensorOp** | 70 | 10.1+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu) | +| **TensorOp** | 75 | 10.2+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu) | +| **TensorOp** | 75 | 10.2+ | `s8 * s8 + s32 => {s32, s8}` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu) | +| **Simt** | 80 | 11.0+ | `f32 * f32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu) | +| **Simt** | 80 | 11.0+ | `cf32 * cf32 + cf32 => cf32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f16 => f16` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `tf32 * tf32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `s8 * s8 + s32 => {s32, s8}` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `s4 * s4 + s32 => {s32, s4}` | NHWC | 
[example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu) |
+
+
 ## Warp-level Matrix Multiply with Tensor Cores
diff --git a/media/docs/implicit_gemm_convolution.md b/media/docs/implicit_gemm_convolution.md
new file mode 100644
index 0000000000..34102918d3
--- /dev/null
+++ b/media/docs/implicit_gemm_convolution.md
@@ -0,0 +1,779 @@
+![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS Implicit GEMM API")
+
+[README](/README.md#documentation) > **Implicit GEMM Convolution**
+
+# CUTLASS Convolution
+
+Implicit GEMM is the formulation of a convolution operation as a GEMM (generalized matrix-matrix
+product). Convolution takes an activation tensor and applies a sliding filter on it to produce an
+output tensor.
+
+## Introduction
+
+This release of CUTLASS contains several artifacts related to convolution.
+
+- [**Implicit GEMM Algorithm**](implicit_gemm_convolution.md#implicit-gemm-algorithm)
+- [**CUTLASS Convolution Implementation**](implicit_gemm_convolution.md#cutlass-convolution-implementation)
+- [**Convolution Examples**](implicit_gemm_convolution.md#convolution-example)
+
+
+# Implicit GEMM Algorithm
+
+2-D convolution may be mapped to matrix multiply by forming a _convolution matrix_ containing
+elements of the activations tensor, then multiplying this by a matrix formed from the filters tensor.
+The earliest form of this algorithm constructs the convolution matrix explicitly via an operation
+conventionally referred to as `im2col`. The resulting matrix replicates each activation element by a factor
+equal to the filter size, consuming additional storage capacity and memory bandwidth.
+
+The _implicit GEMM_ algorithm is a variation on the blocked, hierarchical GEMM computation in CUDA
+that instead forms tiles of the convolution matrix on the fly as data is loaded from global memory
+into Shared Memory by carefully updating pointers and predicates. Once the convolution matrix is
+formed in Shared Memory, the existing components computing warp-level GEMM accumulate the result of
+convolution and update the output tensor.
+
+This section describes the structure of an efficient Implicit GEMM Convolution CUDA kernel
+for Turing Tensor Cores.
+
+## Mapping Convolution to GEMM
+
+The forward convolutional layer computes an output tensor _y = conv(x, w)_ where x(NHWC), w(KRSC), and y(NPQK)
+are 4-D tensors.
+
+This computation may be described by the following analytic function.
+
+```
+y[n, p, q, k] = sum_c(sum_r(sum_s( x[n, f(p, r), g(q, s), c] * w[k, r, s, c] )))
+```
+where functions _f_ and _g_ are defined as follows.
+
+```
+f(p, r) = p * stride_h + R - r - 1 + pad_h
+g(q, s) = q * stride_w + S - s - 1 + pad_w
+```
+
+[Host](/tools/util/include/reference/host/convolution.h) and [device](/tools/util/include/reference/device/convolution.h)
+reference implementations are provided in the CUTLASS Utilities.
+
+This computation may be mapped to the elements of a matrix product as follows.
+
+```
+C = gemm(A, B)
+```
+where
+- A is a row-major matrix of extent _NPQ_-by-_RSC_ containing activations
+- B is a column-major matrix of extent _RSC_-by-_K_ containing filters
+- C is a row-major matrix of extent _NPQ_-by-_K_ containing the output
+
+Each element of the output matrix _Cij_ corresponds to an element in the output tensor y[n, p, q, k] according to
+the following relation.
+```
+y[n, p, q, k] = Cij
+```
+where
+```
+i = q + Q * (p + P * n)
+j = k
+```
+
+These relations may be inverted as follows.
+``` +k = j + +n = i / (PQ) +residual = i % (PQ) + +p = residual / Q +q = residual % Q +``` + +The triple loop nest iterating over CRS to accumulate the result may also be linearized and mapped to the inner +GEMM _K_ dimension (not to be confused with the filter tensor dimension _K_) by the following relations. + +``` +gemm_k = s + S * (r + R * c) +``` +and inverse +``` +c = gemm_k / (RS) +residual = gemm_k % (RS) + +r = residual / S +s = residual % S +``` + +Given these equations, a GEMM triple loop nest could be augmented with tensor indexing as follows. +```c++ +int GEMM_M = N * P * Q; +int GEMM_N = K; +int GEMM_K = C * R * S; + +for (int gemm_i = 0; gemm_i < GEMM_M; ++gemm_i) { + for (int gemm_j = 0; gemm_j < GEMM_N; ++gemm_j) { + + int n = gemm_i / (PQ); + int npq_residual = gemm_i % (PQ); + + int p = npq_residual / Q; + int q = npq_residual % Q; + + Accumulator accum = 0; + + for (int gemm_k = 0; gemm_k < GEMM_K; ++gemm_k) { + + int k = gemm_j; + + int c = gemm_k / (RS); + int crs_residual = gemm_k % (RS); + + int r = crs_residual / S; + int s = crs_residual % S; + + int h = f(p, r); + int w = g(q, s); + + ElementA a = tensor_A.at({n, h, w, c}); + ElementB b = tensor_B.at({k, r, s, c}); + + accum += a * b; + } + + C[gemm_i * K + gemm_j] = accum; + } +} +``` +The [CUTLASS GEMM implementation](/media/docs/efficient_gemm.md) explicitly iterates over tiles. Consequently, +a tile iterator could be implemented to compute these functions analytically and load the appropriate +elements. However, the resulting modulo arithmetic would be computationally intensive, and overhead would +limit performance of a GEMM kernel targeting Turing Tensor Cores. + +The following section describes how an efficient implementation may be implemented within the structure of +a hierarchical GEMM kernel targeting Tensor Cores. + + +# CUTLASS Convolution Implementation + +The CUTLASS Implicit GEMM implementation makes several assumptions. + +- All tensors are 128-bit aligned NHWC tensors +- Channel count (C) is a multiple of 32 elements +- Filter count (K) is a multiple of 32 elements + +This enables 128-bit vector memory acceses which lead to efficient CUDA kernels. + +# CUTLASS Device-level Convolution Operator + +CUTLASS defines CUDA C++ templates accepting numerous template arguments to specialize the resulting +kernel by operation, data type, tile configuration, math instruction, and fused output operation. + +In [09_turing_tensorop_conv2dfprop.cu](/examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop.cu), a convolution +operation is defined as follows. 
+
+```c++
+/// Define an Implicit GEMM convolution forward propagation (fprop) kernel
+using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+  ElementInputA,                               // data type of element a (mapped to activation for fprop)
+  LayoutInputA,                                // layout of element a (mapped to activation for fprop)
+  ElementInputB,                               // data type of element b (mapped to filters for fprop)
+  LayoutInputB,                                // layout of element b (mapped to filters for fprop)
+  ElementC,                                    // data type of element c (mapped to output for fprop)
+  LayoutC,                                     // layout of element c (mapped to output for fprop)
+  ElementAccumulator,                          // data type of internal accumulation
+  MMAOp,                                       // opcode class tag
+  SmArch,                                      // target SM architecture
+  ThreadblockShape,                            // shape of threadblock tile
+  WarpShape,                                   // shape of warp-level GEMM tile
+  InstructionShape,                            // shape of target math instruction
+  EpilogueOp,                                  // epilogue operator
+  SwizzleThreadBlock,                          // optional function to reorder threadblocks for locality
+  NumStages,                                   // number of pipeline stages in threadblock-scoped GEMM
+  cutlass::arch::OpMultiplyAddSaturate,        // math operation on data of element a and b
+  cutlass::conv::IteratorAlgorithm::kAnalytic  // global memory iterator algorithm
+>::Kernel
+```
+
+This template is intended to be generic and cover all feasible configurations. The example specifies
+the following concrete data types, layouts, and tile sizes.
+
+```c++
+/// Define an Implicit GEMM convolution forward propagation (fprop) kernel
+using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+  cutlass::int4b_t,                            // data type of element a (mapped to activation for fprop)
+  cutlass::layout::TensorNHWC,                 // layout of element a (mapped to activation for fprop)
+  cutlass::int4b_t,                            // data type of element b (mapped to filters for fprop)
+  cutlass::layout::TensorNHWC,                 // layout of element b (mapped to filters for fprop)
+  int32_t,                                     // data type of element c (mapped to output for fprop)
+  cutlass::layout::TensorNHWC,                 // layout of element c (mapped to output for fprop)
+  int32_t,                                     // data type of internal accumulation
+  cutlass::arch::OpClassTensorOp,              // opcode class tag
+  cutlass::arch::Sm75,                         // target SM architecture
+  cutlass::gemm::GemmShape<128, 128, 128>,     // shape of threadblock tile
+  cutlass::gemm::GemmShape<64, 64, 128>,       // shape of warp-level GEMM tile
+  cutlass::gemm::GemmShape<8, 8, 32>,          // shape of target math instruction
+  cutlass::epilogue::thread::LinearCombinationClamp<
+    int32_t,                                   // data type of output matrix
+    8,                                         // the number of elements per vectorized
+                                               // memory access. This becomes the vector width of
+                                               // math instructions in the epilogue too.
+    int32_t,                                   // data type of accumulator
+    float>,                                    // epilogue operator
+  SwizzleThreadBlock,                          // optional function to reorder threadblocks for locality
+  2,                                           // number of pipeline stages in threadblock-scoped GEMM
+  cutlass::arch::OpMultiplyAddSaturate,        // math operation on data of element a and b
+  cutlass::conv::IteratorAlgorithm::kAnalytic  // global memory iterator algorithm
+>::Kernel
+```
+
+That is, this computes 2D convolutional forward propagation with 4-bit integer inputs and outputs (`cutlass::int4b_t`).
+Internal accumulation is performed using 32-bit integers (`int32_t`), and an elementwise linear combination operation
+is performed on the output in single-precision floating point (`float`).
+
+The threadblock and warp-level tile sizes refer to the hierarchically blocked GEMM computation
+[described here](/media/docs/gemm_api.md).
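+
+As a quick illustrative check (not part of the example source; the constant names below are local
+to this sketch), the warp count per threadblock follows from the ratio of the threadblock tile to
+the warp tile in each dimension:
+
+```c++
+// Warps per threadblock implied by the tile shapes chosen above.
+constexpr int kWarpsM = 128 / 64;    // ThreadblockShape::kM / WarpShape::kM
+constexpr int kWarpsN = 128 / 64;    // ThreadblockShape::kN / WarpShape::kN
+constexpr int kWarpsK = 128 / 128;   // ThreadblockShape::kK / WarpShape::kK
+
+constexpr int kWarpCount = kWarpsM * kWarpsN * kWarpsK;   // 2 * 2 * 1 = 4 warps
+constexpr int kThreadsPerBlock = kWarpCount * 32;         // 128 threads per threadblock
+```
+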
Larger tiles achieve greater reuse of data loaded through shared memory +but launch fewer CTAs and may not fully occupy the GPU for small problem sizes. Smaller tile configurations achieve +lower peak utilizations but may better match the number of SMs within the GPU for real-world workloads. + + +## Launching the convolution + +The following code collects the arguments for an implicit GEMM operation into a structure. + +```c++ +// +// Define arguments for CUTLASS Convolution +// + +// mode (kCrossCorrelation or kConvolution) +cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + +// Split K dimension into 1 partitions +int split_k_slices = 1; + +cutlass::conv::Conv2dProblemSize problem_size( + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + mode, + split_k_slices); + +typename ImplicitGemm::Arguments arguments{ + problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_c.device_ref(), + {options.alpha, options.beta}, +}; +``` + +The `mode` flag indicates whether to compute cross correlation or convolution. The arguments +`input_size`, `filter_size`, `padding`, `conv_stride`, and `dilation` specify the dimensions of the +input and output tensors and characterize the problem size. + +The arguments `tensor_a.device_ref()`, `tensor_b.device_ref()`, and `tensor_c.device_ref()` are +CUTLASS `TensorRef<>` objects containing a pointer to the tensor data in GPU device memory and stride values. + +The following code initializes and launches the Implicit GEMM operation on the device. After initializing +the arguments structure, it is used to query device-side workspace requirements and allocate them +in device memory if needed. + +Then, the Implicit GEMM object is initialized with the `arguments` structure and the workspace in +device memory. This initialization step precomputes internal lookup tables used by the convolution kernel +and may also clear the device-side workspace if needed. + +Finally, the initialized Implicit GEMM object is called, launching a kernel on the device. `tensor_c` now +contains the result of the implicit GEMM. + +```c++ +ImplicitGemm implicit_gemm_op; + +// Query workspace size +size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + +// Allocate workspace memory +cutlass::device_memory::allocation workspace(workspace_size); + +// Initialize the Implicit GEMM object +cutlass::Status status = implicit_gemm_op.initialize(arguments, workspace.get()); + +if (status != cutlass::Status::kSuccess) { + /* error */ +} + +// +// Launch initialized CUTLASS kernel +// + +status = implicit_gemm_op(); + +if (status != cutlass::Status::kSuccess) { + /* error */ +} +``` + +The example demonstrates how the input and output tensors may be written to a file as CSV using +`cutlass::HostTensor<>` defined in the [CUTLASS Utilities](/media/docs/utilities.md). + +```c++ + std::ofstream output_workspace(ss.str()); + + output_workspace + << "Input = \n" << tensor_a.host_view() << "\n\n" + << "Filters = \n" << tensor_b.host_view() << "\n\n"; + + // Copy device memory to host backing store + tensor_c.sync_host(); + + output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl; +``` + + +## CUTLASS Components + +CUTLASS defines the following CUDA C++ templates to implement Implicit GEMM Convolution which are described in greater detail in subsequent sections. + +**Activations tile iterators** load the activations tile into registers. 
Two implementations are provided:
+- [conv2d_fprop_activation_tile_access_iterator_analytic.h](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h) computes pointer deltas and masks analytically
+- [conv2d_fprop_activation_tile_access_iterator_optimized.h](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h) optimizes iterating over global memory and
+creating the GEMM-A tile in shared memory.
+
+**Filter tile iterators** load filters into registers. Similarly, two implementations are provided:
+- [conv2d_fprop_filter_tile_access_iterator_analytic.h](/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h) computes pointer deltas and masks analytically
+- [conv2d_fprop_filter_tile_access_iterator_optimized.h](/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h) optimizes iterating over global memory and
+creating the GEMM-B tile in shared memory.
+
+The improvements covered by the optimized iterators are:
+- (a) Precomputing kernel-invariant pointer deltas on the host
+- (b) Computing CTA-invariant mask predicates in device-side iterator constructors
+- (c) Use of [fast divmod](/include/cutlass/fast_math.h) to map GEMM dimensions to convolution tensors.
+For example, the _optimized_ activation iterator uses fast divmod to map the GEMM _M_ dimension to _NPQ_.
+
+
+**Pipelined mainloop** loads threadblock-scoped tiles from global memory into shared memory and then applies
+CUTLASS warp-level GEMM operations to load from Shared Memory and issue instructions to Turing Tensor Cores.
+- [implicit_gemm_pipelined.h](/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h)
+
+Operations for storing to shared memory and performing warp-wide matrix multiply operations using
+Turing Tensor Cores are applied directly from the CUTLASS GEMM components. These include the
+following components.
+
+**Regular Tile Iterator** implemented in
+[transform::threadblock::RegularTileIterator](/include/cutlass/transform/threadblock/regular_tile_iterator.h)
+stores register-backed fragments to Shared Memory in permuted layouts.
+
+**Warp-level GEMM** defined in [cutlass::gemm::warp::MmaTensorOp](/include/cutlass/gemm/warp/mma_tensor_op.h)
+defines tile iterators to load from Shared Memory and issue math instructions to Turing Tensor Cores.
+Further details are [described here](/media/docs/gemm_api.md#warp-level-matrix-multiply-api).
+
+**Epilogue** reorders accumulator elements among threads within a threadblock to efficiently update
+the output tensor. It is implemented in [epilogue::threadblock::Epilogue](/include/cutlass/epilogue/threadblock/epilogue.h).
+
+### Loading Activations and Filters
+
+The Implicit GEMM Convolution algorithm partitions the GEMM _K_ dimension (of extent _CRS_) into
+threadblock tiles, assigning each threadblock tile to one filter position and an interval
+of channels. After iterating over all filter positions, the convolution algorithm advances to the
+next interval of channels and proceeds from filter `r=0, s=0`.
+
+The matrix product of one threadblock tile is computed per iteration of
+the mainloop as described in the [CUTLASS GEMM implementation](/media/docs/efficient_gemm.md). To
+summarize, the threadblock tiles of activations and filters are loaded from tensors in global memory
+and stored to shared memory. Each thread within the threadblock loads one or more vectors, and
+together the threads span the entire tile.
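+
+The traversal order described above can be written as a small host-side sketch (illustrative only;
+the filter extents and the 32-channel interval below are example values consistent with this
+discussion, not CUTLASS code):
+
+```c++
+#include <cstdio>
+
+int main() {
+  int R = 3, S = 3, C = 64;   // filter extents and channel count (illustrative)
+  int c_interval = 32;        // channels covered by one threadblock tile (illustrative)
+
+  // Outer loop: advance to the next interval of channels only after all filter
+  // positions (r, s) have been visited for the current interval.
+  for (int c = 0; c < C; c += c_interval) {
+    for (int r = 0; r < R; ++r) {
+      for (int s = 0; s < S; ++s) {
+        std::printf("threadblock tile covers r=%d, s=%d, c=[%d, %d)\n", r, s, c, c + c_interval);
+      }
+    }
+  }
+  return 0;
+}
+```
+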
+
+The following figure illustrates one particular iteration of the Implicit GEMM mainloop. Each
+thread within the threadblock is mapped to several vectors of elements in the Activations and
+Filters tensors. Each index in the GEMM _M_ dimension corresponds to a unique _(N,P,Q)_
+index of the output tensor, and pointers may be computed based on this as well as
+filter position _(r,s)_.
+
+![ALT](/media/images/conv2d-fprop-int4.png "Convolution Forward Propagation on INT4 data.")
+
+The CUTLASS component that embodies this functionality is [Conv2dFpropActivationTileAccessIteratorAnalytic](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h).
+Its constructor computes the mapping of GEMM _M_ to _(N, P, Q)_, and the `at()` method computes the linear offset into the Activations
+tensor for each memory access the thread is to perform. Additionally, the method `valid()` computes the validity of each access
+for each filter position, indicating whether the memory access falls within the bounds of the
+tensor or out of bounds.
+
+`operator++()` iterates over the memory accesses performed by a thread in both the contiguous and strided dimensions.
+
+```c++
+// cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
+
+// Update iterator to thread's next contiguous, strided memory access
+Conv2dFpropActivationTileAccessIteratorAnalytic &operator++() {
+  ++iteration_contiguous_;
+  if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+    return *this;
+  }
+  iteration_contiguous_ = 0;
+
+  ++iteration_strided_;
+  if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+    return *this;
+  }
+  iteration_strided_ = 0;
+
+  return *this;
+}
+```
+
+After all accesses have been visited for the current threadblock tile, `advance()` updates the pointers to the next tile.
+The offsets added to each pointer follow the traversal of filter positions, performing one of the
+following:
+- advance from filter position _(r, s, c)_ to filter position _(r, s+1, c)_
+- advance from filter position _(r, S-1, c)_ to filter position _(r+1, 0, c)_
+- advance from filter position _(R-1, S-1, c)_ to filter position _(0, 0, c+32)_
+
+The following logic within the body of `advance()` computes the above three updates for the activation GEMM-A tile.
+
+```c++
+// cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
+
+// Advance to the next access
+void advance() {
+  // moves to the next tile
+  ++filter_s_;
+  if (filter_s_ < problem_size_.S) {
+    return;
+  }
+  filter_s_ = 0;
+
+  ++filter_r_;
+  if (filter_r_ < problem_size_.R) {
+    return;
+  }
+  filter_r_ = 0;
+
+  filter_c_ += Shape::kRow * problem_size_.split_k_slices;
+}
+```
+
+Similar logic holds for [Conv2dFpropFilterTileAccessIteratorAnalytic](/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h).
+
+To reduce computational overhead in the mainloop body, the pointer offsets may be precomputed
+in host code and provided to the CUDA kernel as a lookup table in its `Params` structure.
+As shown in [Conv2dFpropActivationTileAccessIteratorOptimized](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h),
+the logic to compute offsets from the filter position has been extracted to the `Params` constructor.
+
+```c++
+// cutlass/conv/threadblock/conv2d_params.h
+struct Conv2dFpropActivationIteratorOptimizedParams {
+  ...
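+// inc_next[] is the delta table referenced in the surrounding text: one precomputed byte
+// offset for each of the three traversal cases (next S, next R, next C), so that the
+// device-side advance() only performs a table lookup instead of recomputing pointer arithmetic.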
+// next S +inc_next[0] = conv_sign * (int64_t(layout.stride()[0]) * problem_size.dilation_w) * element_size_bits / 8; + +// next R +inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + +// next C +inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + ... +} +``` + +This allows only a simple lookup from the _delta table_ performed in device code in `Conv2dFpropActivationTileAccessIteratorOptimized::advance()` + +```c++ +// cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h +CUTLASS_HOST_DEVICE +void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + filter_s_ = 0; + ++filter_r_; + + if (filter_r_ < problem_size_.R) { + next_idx = 1; + } + else { + filter_r_ = 0; + next_idx = 2; + } + } + + add_byte_offset_(params_.inc_next[next_idx]); // in addition to Conv2dFpropActivationTileAccessIteratorAnalytic::advance() + + if (next_idx == 2) { + filter_c_ += params_.filter_c_delta; + } +} + +``` + +### Utilizing Tensor Cores + +Turing Tensor Cores compute matrix multiply-accumulate operations efficiently by sharing data among all +threads within a warp. The following operations are supported. + +|**Shape**|**A**|**B**|**C**| +|---------|-----|-----|-----| +| 8x8x32 | int4b_t | int4b_t | int32_t | +| 8x8x16 | int8b_t | int8b_t | int32_t | +| 16x8x8 | half | half | half | +| 16x8x8 | half | half | float | + +Functionally, the Turing 8x8x32 matrix multiply operation distributes the _A_, _B_, and _C_ matrix across 32 +threads within a warp according to the following illustration. + +![ALT](/media/images/mma-8x8x32.png "Turing Tensor Op") + +This Tensor Core operation is accessible to the CUDA programmer via the PTX instruction +[`mma.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-8832). +CUTLASS wraps inline PTX with device-side intrinsics defined in [`cutlass/arch/mma_sm75.h`](/include/cutlass/arch/mma_sm75.h) +as in the following example. + +```c++ +unsigned A; // eight packed 4-bit integer elements +unsigned B; // eight packed 4-bit integer elements + +int C[2]; // two 32-bit integer elements +int D[2]; // two 32-bit integer elements + +asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +``` + +To efficiently load data from Shared Memory into registers with the distribution among +warps matching the above, the Turing GPU architecture introduces +[`ldmatrix`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix). +`ldmatrix` is the ultimate warp-cooperative instruction, as all threads contribute addresses to up to 32 row vectors of +size 128-bits in length. These rows are fetched from Shared Memory and then distributed among groups of four threads +per row. + +The arrangement of SMEM pointers and destination registers within threads is illustrated as follows. Thread 0 is highlighted +in the illustration to emphasize the mapping. 
+
+![ALT](/media/images/ldmatrix-8x128bx4.png "Turing ldmatrix PTX instruction")
+
+The size of the Turing Tensor Core operation computing matrix multiply-accumulate on INT4 data is 8-by-8-by-32
+elements. `ldmatrix` fetches up to 32 rows (or columns) per operation. Sixteen Tensor Core operations may be issued
+to implement a 32-by-32-by-32 matrix product and perfectly consume all data loaded by two `ldmatrix` instructions
+as shown in the following figure. Larger tiles are possible by increasing the number of memory instructions
+and issuing more Tensor Core operations, up to warp-level matrix operations of size 64-by-64-by-32. The limit is
+the number of registers to hold the accumulator elements.
+
+![ALT](/media/images/ldmatrix-tensorop-32x32x32.png "Turing ldmatrix PTX instruction feeding Tensor Core operations")
+
+### Shared Memory Layouts
+
+In the previous two sections, we have described how data may be loaded from activations and filters tensors
+in global memory to compute convolution, and we have described a composition of `ldmatrix` and `mma.sync`
+to fetch data from Shared Memory and issue Tensor Core operations.
+
+To ensure this data movement is efficient, care must be taken to ensure bank conflicts are avoided. CUTLASS
+uses a permuted Shared Memory layout to avoid bank conflicts when storing to Shared Memory and to efficiently
+load from Shared Memory using `ldmatrix`. The following figure illustrates the thread mapping used for
+loading the activations and filters threadblock tiles from global memory and the permuted layout in
+Shared Memory.
+
+![ALT](/media/images/tensor-op-permuted-smem-layout-TN.png "Shared Memory layout used for Turing Tensor Cores")
+
+In the illustration, one warp-wide memory access is highlighted in blue, with individual threads
+loading one 128-bit vector. The tile in global memory could correspond either to the activations
+or filters and is assumed to be 'strip-mined' with four threads loading consecutive channels.
+
+Shared Memory is visualized as a 'row-major' matrix with eight columns representing
+the eight 128-bit banks.
+As described in the CUTLASS GTC 2019 presentation [slides](https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9593-cutensor-high-performance-tensor-operations-in-cuda-v2.pdf),
+[recording](https://developer.nvidia.com/gtc/2019/video/S9593), an access to Shared Memory will be conflict-free if
+the following conditions are satisfied across each warp:
+- {T0, T1, .., T7} do not access the same 128-bit bank
+- {T8, T9, .., T15} do not access the same 128-bit bank
+- {T16, T17, .., T23} do not access the same 128-bit bank
+- {T24, T25, .., T31} do not access the same 128-bit bank
+
+To achieve conflict-free stores, the Shared Memory layout remaps the strip-mined arrangement to transpose
+the vectors and applies an XOR operation on the column index of each thread's pointer. Specifically,
+
+```c++
+  int store_column = (lane_id % 8) ^ (lane_id / 8);
+```
+
+This transformation on the layout will be instrumental in reading slices of data from Shared Memory
+to compute the warp-level matrix multiply using Tensor Cores.
+
+The following figure shows how the first sixteen threads participating in an `ldmatrix` instruction
+logically map to the c=0..31 slice of a matrix in Shared Memory. This slice is known as a "k-group"
+within the code because it corresponds to the same K-index of a warp-level matrix multiply.
+ +![ALT](/media/images/tensor-op-permuted-smem-layout-TN-k0.png "Load kgroup=0 from Shared Memory using ldmatrix") + +The lower half of the figure shows the physical arrangement in Shared Memory, with threads offset by row and column +according to the XOR function. By inspection, we can observe there are no bank conflicts, as _T0 ... T7_ each access unique +banks, as do _T8 ... T15_. and beyond. + +To advance to the next "k-group" within Shared Memory, pointers are updated using an XOR operation according to +the following sequence: +- **^1** advances from _k=0_ to _k=1_ +- **^3** advances from _k=1_ to _k=2_ +- **^1** advances from _k=2_ to _k=3_ +- **^3** advances from _k=3_ to _k=0_ + +The first of these transitions is shown below. +![ALT](/media/images/tensor-op-permuted-smem-layout-TN-k1.png "Advance to kgroup=1 from Shared Memory using ldmatrix") + +The [CUTLASS warp-level GEMM API](/media/docs/gemm_api.md#warp-level-matrix-multiply-api) defines templates for +loading slices of data from permuted Shared Memory and issuing operations to Tensor Cores. + +### Updating the Output Tensor + +After the mainloop terminates, the accumulator tile of the warp-level GEMM stores a warp's contribution to the output +tensor. However, the distribution of data among threads within the threadblock is specialized for efficient matrix multiply-accumulate +operations using Tensor Cores and is not conducive to efficient, coalesced operations to Global Memory. A data rearrangement is +needed. + +The **Epilogue** is the component for exchanging accumulator elements through Shared Memory, loading slices of the output +matrix or tensor, applying an elementwise operation such as linear scaling or bias, and storing the result to the output tensor. +CUTLASS structures this as several components: +- [cutlass::epilogue::threadblock::Epilogue](/include/cutlass/epilogue/threadblock/epilogue.h) - the top-level component for looping over the entire threadblock tile +- [cutlass::epilogue::warp::TileIteratorTensorOp](/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h) - a specialized component for storing accumulators for Tensor Core to Shared Memory +- [cutlass::epilogue::threadblock::SharedLoadIterator](/include/cutlass/epilogue/threadblock/shared_load_iterator.h) - a component for loading elements from a row-major arrangement in Shared Memory +- [cutlass::epilogue::threadblock::PredicatedTileIterator](/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h) - a component for loading or storing matrix fragments to Global Memory (with bounds checks) +- [cutlass::epilogue::thread::LinearCombination](/include/cutlass/epilogue/thread/linear_combination.h) - an element-wise function computing `alpha * AB + beta * C` to compute the final output + +## Unit Tests + +Unit tests verify the functional behavior of each of the above components in a standalone CUDA kernel. This provides a +convenient environment to (a.) inspect the template definition, (b.) showcase instantiation of use of these templates +in device code, and (c.) assert functional correctness. 
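+
+For example, a standalone device-level test follows the pattern below. This is a schematic sketch
+only: the test name is illustrative, and the header `cutlass/conv/kernel/default_conv2d_fprop.h` is
+assumed by analogy with the dgrad tests; the authoritative versions are the unit tests linked below.
+
+```c++
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"        // assumed fprop counterpart of default_conv2d_dgrad.h
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32,
+  128x128_128x2_64x64x128) {
+
+  // Device-level Conv2d kernel mirroring the int4 fprop example earlier in this document
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    cutlass::int4b_t,
+    cutlass::layout::TensorNHWC,
+    cutlass::int4b_t,
+    cutlass::layout::TensorNHWC,
+    int32_t,
+    cutlass::layout::TensorNHWC,
+    int32_t,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    cutlass::gemm::GemmShape<128, 128, 128>,
+    cutlass::gemm::GemmShape<64, 64, 128>,
+    cutlass::gemm::GemmShape<8, 8, 32>,
+    cutlass::epilogue::thread::LinearCombinationClamp<
+      int32_t,
+      8,
+      int32_t,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  // Wrap the kernel in the device-level operator
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  // Run the testbed over a set of problem sizes with a reference check
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
+}
+```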
+ +**Convolution unit tests** +- Device-wide convolution operator: [conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu) + +**GEMM unit tests** +- Warp-scoped matrix multiply for Turing Tensor Cores: [gemm_sm75.cu](/test/unit/gemm/warp/gemm_sm75.cu) + +**Epilogue unit tests** +- Epilogue for Turing Tensor Cores: [epilogue_tensor_op.cu](/test/unit/epilogue/threadblock/epilogue_tensor_op.cu) + + +# Convolution Example + +This section describes the provided convolution example and is intended to orient the reader to the CUTLASS implementation +of Implicit GEMM Convolution. + +## Building and Running the Example + +Example `09_turing_tensorop_conv2dfprop` computes a forward convolutional layer in which inputs and +outputs are 4-b integers. The example source is visible in +[examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu](/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu). + + +Before building the example, first perform the prerequisite steps for building any CUTLASS component [described here](/media/docs/quickstart.md). +Compute capability 7.5 refers to the Turing architecture, and this work requires CUDA 10.2 Toolkit or later to target +Turing Tensor Cores using the native `mma` [PTX instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-8832). + +```bash +$ mkdir build && cd build + +$ cmake .. -DCUTLASS_NVCC_ARCHS=75 +``` + +To build the example, execute `make 09_turing_tensorop_conv2dfprop` from the build directory. +```bash +$ make 09_turing_tensorop_conv2dfprop + +$ ls examples/09_turing_tensorop_conv2dfprop +examples/09_turing_tensorop_conv2dfprop + +``` + +This example provides a simple command line interface to specify the extents of 4D tensors of 4-bit integer elements (`cutlass::int4b_t`), +initialize them to random values, and compute the result of a convolutional layer. Optionally, the input and output +tensors may be saved to .csv files, and the CUTLASS host-side reference check may be executed to verify correctness. + +The complete usage statement is visible by running with `--help`: +```bash +$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --help +09_turing_tensorop_conv2dfprop example + + This example uses Turing's Tensor Core operators on int4 data types to compute + forward convolution on tensors of layout NHWC. + +Options: + + --help If specified, displays this usage statement. + + --n Input tensor extent N + --h Input tensor extent H + --w Input tensor extent W + --c Input tensor extent C + --k Filter extent K + --r Filter extent R + --s Filter extent S + + --alpha Epilogue scalar alpha + --beta Epilogue scalar beta + + --ref-check If set (true), reference check on the host is computed + --perf-check If set (true), performance is measured. + --benchmark If set (true), performance benchmarking on several layers and batch-size. + --iterations Number of profiling iterations to perform. + --save-workspace If set, workspace is written to a text file. 
+ --tag String to replicate across the first column in the results table + + + +Examples: + +$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1 + +$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check +``` + +*Note*, this example assumes all tensors are 128b aligned and in format _NHWC_. Consequently, dimension +_C_ must be divisible by 32 for activations, filters, and output. + +If the option `--benchmark` is passed, several layers from ResNet50 are profiled for various batch sizes. +This sample output was computed on an NVIDIA RTX 2080 compiled with CUDA 10.2. + +```bash +build$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --benchmark +``` + +Convolution can also be run by the CUTLASS Profiler. + + +# Copyright + +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + +``` + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` diff --git a/media/docs/profiler.md b/media/docs/profiler.md index dd1f62a7c9..032848c6fa 100644 --- a/media/docs/profiler.md +++ b/media/docs/profiler.md @@ -109,16 +109,29 @@ About: Operations: --operation= Specifies a particular operation to run or print the usage statement. - gemm General matrix-matrix product. D = alpha * A*B + beta * C + gemm General matrix-matrix product. D = alpha * A*B + beta * C + spgemm Structured sparse GEMM. D = alpha * A*B + beta * C + conv2d Conv2d operation. Output(Tensor4D) = alpha * Input(Tensor4D) * Filter(Tensor4D) + beta * Input(Tensor4D) + conv3d Conv3d operation. Output(Tensor5D) = alpha * Input(Tensor5D) * Filter(Tensor5D) + beta * Input(Tensor5D) For more details about a particular operation, specify the operation name with --help. 
 Example:
 
-  $ ./tools/profiler/cutlass_profiler --operation=Gemm --help
+  $ cutlass_profiler --operation=Gemm --help
+
+  $ cutlass_profiler --operation=Conv3d --help
+
+  $ cutlass_profiler --operation=Conv2d --help
+
+  $ cutlass_profiler --operation=SparseGemm --help
 ```
 
+# GEMM
+
+The CUTLASS Profiler is capable of executing each GEMM kernel.
+
 ## GEMM Arguments
 
 The complete set of arguments available to each operation may be viewed by specifying the operation name
@@ -189,7 +202,7 @@ Test your changes to gemm kernels with a quick functional test and save results
     --providers=cutlass --output=functional-test.csv
 ```
 
-## Example SGEMM
+## Example CUDA Core GEMM Operation (SGEMM)
 
 Example command line for profiling SGEMM kernels is as follows:
 ```bash
@@ -226,7 +239,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
 
 Note, the arguments which appear in the output may be used as command line parameters for subsequent invocations.
 
-## Example Tensor Core Operations
+## Example Tensor Core GEMM Operations (S16816GEMM)
 
 To execute kernels targeting Tensor Core operations, supply the flag `--op_class=tensorop` in the command line.
 
@@ -293,6 +306,158 @@ $ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sgemm_128x128_nn
   --tags=cutlass:2.2,date:2020-06-08
 ```
 
+# Convolution
+
+The CUTLASS Profiler is capable of executing 2-D and 3-D convolution problems for the forward and backward
+operator variants.
+
+The CUTLASS Profiler can be built with cuDNN enabled, using it as a reference implementation. If CMake detects
+the cuDNN library available in the system, it is included as a dependency. This may be explicitly overridden
+with the CMake flag `CUTLASS_ENABLE_CUDNN`.
+
+```bash
+$ cmake .. -DCUTLASS_LIBRARY_OPERATIONS=conv2d -DCUTLASS_ENABLE_CUDNN=OFF
+...
+$ make -j16 cutlass_profiler +``` + + +## Convolution Arguments + +```bash +$ ./tools/profiler/cutlass_profiler --help --operation=Conv2d + +Conv2d + + [enum] --conv_kind Convolutional operator (fprop, dgrad, wgrad) + [int] --n,--input_n Input N dimension of the Conv2d problem space + [int] --h,--input_h Input H dimension of the Conv2d problem space + [int] --w,--input_w Input W dimension of the Conv2d problem space + [int] --c,--input_c Input C dimension of the Conv2d problem space + [int] --k,--filter_k Filter K dimension of the Conv2d problem space + [int] --r,--filter_r Filter R dimension of the Conv2d problem space + [int] --s,--filter_s Filter S dimension of the Conv2d problem space + [int] --p,--output_p Output P dimension of the Conv2d problem space + [int] --q,--output_q Output Q dimension of the Conv2d problem space + [int] --pad_h Padding in H direction + [int] --pad_w Padding in W direction + [int] --stride_h Stride in H direction + [int] --stride_w Stride in W direction + [int] --dilation_h Dilation in H direction + [int] --dilation_w Dilation in W direction + [tensor] --Activation Tensor storing the Activation operand + [tensor] --Filter Tensor storing the Filter operand + [tensor] --Output Tensor storing the Output operand + [enum] --conv_mode Convolution filter mode (conv, cross) + [enum] --iterator_algorithm,--iterator_algo Convolution iterator algorithm (analytic, optimized) + [scalar] --alpha,--epilogue::alpha Epilogue scalar alpha + [scalar] --beta,--epilogue::beta Epilogue scalar beta + [enum] --split_k_mode,--split-k-mode SplitK mode for serial or parallel reduction (serial, parallel) + [int] --split_k_slices,--split-k-slices Number of partitions of K dimension + [enum] --eq_gemm_provider,--eq-gemm-provider Enable profiling equivalent gemm by the following providers (cutlass) + [enum] --op_class,--opcode-class Class of math instruction (simt, tensorop, wmmatensorop, wmma) + [enum] --accum,--accumulator-type Math instruction accumulator data type + [int] --cta_m,--threadblock-shape::m Threadblock shape in the M dimension + [int] --cta_n,--threadblock-shape::n Threadblock shape in the N dimension + [int] --cta_k,--threadblock-shape::k Threadblock shape in the K dimension + [int] --stages,--threadblock-stages Number of stages of threadblock-scoped matrix multiply + [int] --warps_m,--warp-count::m Number of warps within threadblock along the M dimension + [int] --warps_n,--warp-count::n Number of warps within threadblock along the N dimension + [int] --warps_k,--warp-count::k Number of warps within threadblock along the K dimension + [int] --inst_m,--instruction-shape::m Math instruction shape in the M dimension + [int] --inst_n,--instruction-shape::n Math instruction shape in the N dimension + [int] --inst_k,--instruction-shape::k Math instruction shape in the K dimension + [int] --min_cc,--minimum-compute-capability Minimum device compute capability + [int] --max_cc,--maximum-compute-capability Maximum device compute capability + +Examples: + +Profile a particular convolution (specify all the convolution parameters): + + $ cutlass_profiler --operation=Conv2d --Activation=f16:nhwc \ + --Filter=f16:nhwc --Output=f16 --accumulator-type=f32 \ + --n=32 --h=14 --w=14 --c=8 --k=64 --r=3 --s=3 \ + --pad_h=1 --pad_w=1 \ + --stride::h=1 --stride::w=1 --dilation::h=1 --dilation::w=1 + +``` + +## Example CUDA Core Convolution Operation (SFPROP) + +Example command line for profiling Convolution kernels is as follows: + +```bash +$ ./tools/profiler/cutlass_profiler 
--kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc --verification-providers=device --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 + + +============================= + Problem ID: 1 + + Provider: CUTLASS + OperationKind: conv2d + Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc + + Status: Success + Verification: ON + Disposition: Passed + +reference_device: Passed + + Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ + --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \ + --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ + --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 + + Bytes: 2055798784 bytes + FLOPs: 118482796544 flops + + Runtime: 8.13237 ms + Memory: 235.431 GiB/s + + Math: 14569.3 GFLOP/s + +``` + +## Example Tensor Core Convolution Operation (S16816FPROP) + +Example command line for profiling Convolution kernels is as follows: + +```bash +$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s16816fprop_optimized_f16_128x128_64x4_nhwc --verification-providers=device --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 + + + +============================= + Problem ID: 1 + + Provider: CUTLASS + OperationKind: conv2d + Operation: cutlass_tensorop_s16816fprop_optimized_f16_128x128_64x4_nhwc + + Status: Success + Verification: ON + Disposition: Passed + +reference_device: Passed + + Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ + --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f16:nhwc --Filter=f16:nhwc --Output=f32:nhwc \ + --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ + --eq_gemm_provider=none --op_class=tensorop --accum=f32 --cta_m=128 --cta_n=128 --cta_k=64 --stages=4 \ + --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 --max_cc=1024 + + Bytes: 1130659840 bytes + FLOPs: 118482796544 flops + + Runtime: 0.945071 ms + Memory: 1114.21 GiB/s + + Math: 125369 GFLOP/s + + +``` + # Copyright Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md index 427fe13c66..425d927003 100644 --- a/media/docs/quickstart.md +++ b/media/docs/quickstart.md @@ -7,11 +7,15 @@ ## Prerequisites CUTLASS requires: -- NVIDIA CUDA Toolkit (9.2 or later required, [11.0](https://developer.nvidia.com/cuda-toolkit) recommended) +- NVIDIA CUDA Toolkit (9.2 or later required, [11.1](https://developer.nvidia.com/cuda-toolkit) recommended) - CMake 3.12+ - host compiler supporting C++11 or greater (g++ 7.3.0 or Microsoft Visual Studio 2015 recommended) - Python 3.6+ +CUTLASS may be optionally compiled and linked with +- cuBLAS +- cuDNN v7.6 or later + ## Initial build steps Construct a build directory and run CMake. @@ -31,6 +35,23 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BU This reduces overall compilation time by excluding unit tests and enabling the unit build. +You may reduce build times by compiling only certain operations by setting the `CUTLASS_LIBRARY_OPERATIONS` flag as shown below, +executed from an empty `build/` directory. 
This only compiles 2-D convolution kernels.
+
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_OPERATIONS=conv2d
+```
+
+You may also filter kernels by name by supplying a filter string with flag `CUTLASS_LIBRARY_KERNELS`.
+
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_KERNELS=s16816gemm,s16816fprop*128x128
+```
+
+You may explicitly exclude cuBLAS and cuDNN as dependencies with the following CMake flags.
+- `-DCUTLASS_ENABLE_CUBLAS=OFF`
+- `-DCUTLASS_ENABLE_CUDNN=OFF`
+
 ## Build and run the CUTLASS Profiler
 
@@ -39,7 +60,7 @@ From the `build/` directory created above, compile the the CUTLASS Profiler.
 $ make cutlass_profiler -j12
 ```
 
-Then execute the CUTLASS Profiler for a set of problem sizes.
+To execute the CUTLASS Profiler for GEMM, run the following command.
 
 ```bash
 $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096
@@ -66,6 +87,45 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096
 
   Math: 13854.9 GFLOP/s
 ```
 
+To execute the CUTLASS Profiler for Convolution, run the following example.
+```bash
+$ ./tools/profiler/cutlass_profiler --kernels=s1688fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --pad_h=1 --pad_w=1
+```
+
+To execute all CUTLASS 2-D convolution operators, execute the following.
+```bash
+$ ./tools/profiler/cutlass_profiler --operation=conv2d --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3
+
+
+=============================
+  Problem ID: 1
+
+        Provider: CUTLASS
+   OperationKind: conv2d
+       Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
+
+          Status: Success
+    Verification: ON
+     Disposition: Passed
+
+reference_device: Passed
+
+       Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \
+                  --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \
+                  --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \
+                  --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \
+                  --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024
+
+           Bytes: 2055798784  bytes
+           FLOPs: 118482796544  flops
+
+         Runtime: 8.13237  ms
+          Memory: 235.431 GiB/s
+
+            Math: 14569.3 GFLOP/s
+
+```
+
 See [documentation for the CUTLASS Profiler](profiler.md) for more details.
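+
+As a further example composed from the conv2d arguments documented in the
+[CUTLASS Profiler documentation](profiler.md) (shown here as an illustration rather than captured output),
+the backward data gradient variants may be selected with `--conv_kind`:
+
+```bash
+$ ./tools/profiler/cutlass_profiler --operation=conv2d --conv_kind=dgrad --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --pad_h=1 --pad_w=1
+```
+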
## Build and run CUTLASS Unit Tests diff --git a/media/images/conv2d-fprop-int4.png b/media/images/conv2d-fprop-int4.png new file mode 100644 index 0000000000..375c0d752f Binary files /dev/null and b/media/images/conv2d-fprop-int4.png differ diff --git a/media/images/ldmatrix-8x128bx4.png b/media/images/ldmatrix-8x128bx4.png new file mode 100644 index 0000000000..44d50d9ae8 Binary files /dev/null and b/media/images/ldmatrix-8x128bx4.png differ diff --git a/media/images/ldmatrix-tensorop-32x32x32.png b/media/images/ldmatrix-tensorop-32x32x32.png new file mode 100644 index 0000000000..7acc9723f9 Binary files /dev/null and b/media/images/ldmatrix-tensorop-32x32x32.png differ diff --git a/media/images/mma-8x8x32.png b/media/images/mma-8x8x32.png new file mode 100644 index 0000000000..ff65d83222 Binary files /dev/null and b/media/images/mma-8x8x32.png differ diff --git a/media/images/tensor-op-permuted-smem-layout-TN-k0.png b/media/images/tensor-op-permuted-smem-layout-TN-k0.png new file mode 100644 index 0000000000..b9ab8cb313 Binary files /dev/null and b/media/images/tensor-op-permuted-smem-layout-TN-k0.png differ diff --git a/media/images/tensor-op-permuted-smem-layout-TN-k1.png b/media/images/tensor-op-permuted-smem-layout-TN-k1.png new file mode 100644 index 0000000000..ea7d8b3be9 Binary files /dev/null and b/media/images/tensor-op-permuted-smem-layout-TN-k1.png differ diff --git a/media/images/tensor-op-permuted-smem-layout-TN.png b/media/images/tensor-op-permuted-smem-layout-TN.png new file mode 100644 index 0000000000..5bb4fe47b3 Binary files /dev/null and b/media/images/tensor-op-permuted-smem-layout-TN.png differ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 35994ba6d8..436990fd66 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -21,3 +21,4 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add_subdirectory(unit) + diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 52368a346a..d57570ce6c 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -49,19 +49,14 @@ target_link_libraries( cutlass_test_unit_infra ) -set(CUTLASS_INSTALL_TESTS ON CACHE BOOL "Install test executables") -set(CUTLASS_TEST_EXECUTION_ENVIRONMENT "" CACHE BOOL "Environment in which to invoke unit test executables") - -function(cutlass_test_unit_add_executable) +function(cutlass_test_unit_add_executable NAME) set(options) set(oneValueArgs) set(multiValueArgs) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cutlass_add_executable(${__UNPARSED_ARGUMENTS}) - - list(GET __UNPARSED_ARGUMENTS 0 NAME) + cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS}) target_link_libraries( ${NAME} @@ -72,25 +67,13 @@ function(cutlass_test_unit_add_executable) string(REGEX REPLACE cutlass_ "" NAME_STEM ${NAME}) - add_test(c${NAME_STEM} ${NAME}) + set(CUTLASS_TEST_UNIT_TEST_COMMAND_OPTIONS --gtest_output=xml:${NAME_STEM}.gtest.xml) - add_custom_target( - ${NAME_STEM} - COMMAND - ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ - DEPENDS - ${NAME} + cutlass_add_executable_tests( + ${NAME_STEM} ${NAME} + TEST_COMMAND_OPTIONS CUTLASS_TEST_UNIT_TEST_COMMAND_OPTIONS ) - if (CUTLASS_INSTALL_TESTS) - - install( - TARGETS ${NAME} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ) - - endif() - endfunction() add_custom_target(cutlass_test_unit) @@ -99,6 +82,7 @@ add_custom_target(test_unit) set(SUBDIRS core gemm + conv layout transform epilogue diff --git a/test/unit/conv/CMakeLists.txt b/test/unit/conv/CMakeLists.txt new file mode 100644 index 0000000000..a50a58f59e --- /dev/null +++ b/test/unit/conv/CMakeLists.txt @@ -0,0 +1,42 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +add_custom_target(cutlass_test_unit_conv) +add_custom_target(test_unit_conv) + +set(CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED ON CACHE BOOL + "Enable/Disable convolution device reference for conv unit tests.") + +if(CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED) + message(STATUS "Enable device reference verification in conv unit tests") + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED=1) +endif() + +foreach(SUBDIR + device + ) + + add_subdirectory(${SUBDIR}) + add_dependencies(cutlass_test_unit_conv cutlass_test_unit_conv_${SUBDIR}) + add_dependencies(test_unit_conv test_unit_conv_${SUBDIR}) + +endforeach() diff --git a/test/unit/conv/device/CMakeLists.txt b/test/unit/conv/device/CMakeLists.txt new file mode 100644 index 0000000000..ce907e0d58 --- /dev/null +++ b/test/unit/conv/device/CMakeLists.txt @@ -0,0 +1,148 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + add_custom_target( + cutlass_test_unit_conv_device + DEPENDS + cutlass_test_unit_conv_device_simt + cutlass_test_unit_conv_device_tensorop_f32_sm70 + cutlass_test_unit_conv_device_tensorop_f32_sm75 + cutlass_test_unit_conv_device_tensorop_f16_sm80 + cutlass_test_unit_conv_device_tensorop_f32_sm80 + cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 + cutlass_test_unit_conv_device_tensorop_s32 + cutlass_test_unit_conv_device_tensorop_s32_interleaved +) + + add_custom_target( + test_unit_conv_device + DEPENDS + test_unit_conv_device_simt + test_unit_conv_device_tensorop_f32_sm70 + test_unit_conv_device_tensorop_f32_sm75 + test_unit_conv_device_tensorop_f16_sm80 + test_unit_conv_device_tensorop_f32_sm80 + test_unit_conv_device_tensorop_f32_tf32_sm80 + test_unit_conv_device_tensorop_s32 + test_unit_conv_device_tensorop_s32_interleaved +) + +# +# OpClassSimt (CUDA cores) +# + +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_simt + + # F32 + conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu + + conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + + # CF32 + conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu + conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu + conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu + + conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu +) + +# +# OpClassTensorOp (Tensor cores) +# + +# Conv - F16 input, F32 output, F32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_sm70 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu +) + +# Conv2d - F16 input, F32 output, F32 accumulation - SM75 +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_sm75 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu +) + +# Conv2d - F16 input, F16 output, F16 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f16_sm80 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu +) + +# Conv2d - F16 input, F32 output, F32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_sm80 + + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + + conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu + conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +) + +# Conv2d - TF32 input, F32 output, F32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 + + 
conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + + conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +) + +# Conv2d - S8 input, S32 output, S32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_s32 + + conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu + conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu +) + +# Conv2d - S8 interleaved input, S8 interleaved output, S32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_s32_interleaved + + conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu + conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu +) diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu new file mode 100644 index 0000000000..4d500d9783 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -0,0 +1,130 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file
+  \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM50_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
+  64x64_8x2_32x64x8) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = cutlass::complex<float>;
+  using ElementB = cutlass::complex<float>;
+  using ElementC = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+  using ElementCompute = cutlass::complex<float>;
+
+  /// Device-level Conv2d instance
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm50,
+    cutlass::gemm::GemmShape<64, 64, 8>,
+    cutlass::gemm::GemmShape<32, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM50_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
+  32x64_8x2_32x64x8) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = cutlass::complex<float>;
+  using ElementB = cutlass::complex<float>;
+  using ElementC = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+  using ElementCompute = cutlass::complex<float>;
+
+  /// Device-level Conv2d instance
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm50,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    cutlass::conv::StrideSupport::kUnity
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
new file mode 100644
index 0000000000..cc36edc75e
--- /dev/null
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@@
-0,0 +1,314 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using 
ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + 
>::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu new file mode 100644 index 0000000000..aab0d34e49 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -0,0 +1,123 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu new file mode 100644 index 0000000000..bc9ee6e9d7 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -0,0 +1,118 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file
+  \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM70_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
+  128x128_32x2_64x64x32) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = cutlass::half_t;
+  using ElementB = cutlass::half_t;
+  using ElementC = float;
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+
+  /// Device-level Conv2d instance
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm70,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAdd
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM70_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride,
+  128x128_32x2_64x64x32) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = cutlass::half_t;
+  using ElementB = cutlass::half_t;
+  using ElementC = float;
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+
+  /// Device-level Conv2d instance
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm70,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    cutlass::conv::StrideSupport::kUnity
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+#endif  // CUTLASS_ARCH_MMA_SM70_SUPPORTED
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
new file mode 100644
index 0000000000..7417f92197
--- /dev/null
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@@ -0,0 +1,159 @@
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM75_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// 
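// A minimal sketch of how a single problem size can be pushed through a device-level operator
// such as the Conv2dDgrad types defined in the tests in this file, assuming tensors for the
// output gradient, filter, and input gradient have already been allocated and filled on the
// device. The helper name run_conv2d_dgrad_sketch and its HostTensor-style parameters are
// illustrative assumptions, not part of the test harness; TestAllConv2d performs equivalent
// steps internally for a library of problem sizes and verifies the result against a host reference.
template <typename Conv2dDgrad, typename TensorDy, typename TensorW, typename TensorDx>
cutlass::Status run_conv2d_dgrad_sketch(TensorDy &tensor_dy, TensorW &tensor_w, TensorDx &tensor_dx) {

  // One NHWC problem: N=1, H=W=56, C=64 activations; K=64 filters of size 3x3; pad 1, unit stride.
  cutlass::conv::Conv2dProblemSize problem_size(
    {1, 56, 56, 64},    // input size  (NHWC)
    {64, 3, 3, 64},     // filter size (KRSC)
    {1, 1, 1, 1},       // padding     (pad_h, _, pad_w, _)
    {1, 1},             // stride      (stride_h, stride_w)
    {1, 1}              // dilation    (dilation_h, dilation_w)
  );

  // For dgrad, operand A is the gradient w.r.t. the output, operand B is the filter, and the
  // C/D tensors hold the gradient w.r.t. the input; alpha = 1, beta = 0 in the linear-combination
  // epilogue (ElementCompute is float for the kernels in this file).
  typename Conv2dDgrad::Arguments arguments(
    problem_size,
    tensor_dy.device_ref(),
    tensor_w.device_ref(),
    tensor_dx.device_ref(),
    tensor_dx.device_ref(),
    {1.0f, 0.0f}
  );

  Conv2dDgrad conv_op;

  // Reject problem/kernel combinations this instance cannot implement (e.g. stride or alignment).
  cutlass::Status status = conv_op.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // No split-K reduction here, so no extra workspace is needed.
  status = conv_op.initialize(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // Launch the implicit GEMM dgrad kernel on the default CUDA stream.
  return conv_op();
}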
+TEST(SM75_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM75_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..01f51a2cc4 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,286 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic, + cutlass::conv::StrideSupport::kStrided + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 
128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + 
cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_64x4_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // 
CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..7682a319fe --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -0,0 +1,323 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + 
cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + test::conv::device::Conv2dProblemVector user_size; + + user_size.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, 4}, // input size (NHWC) + {8, 1, 1, 4}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d(user_size)); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + 
cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..48c6ddb043 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,124 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu new file mode 100644 index 0000000000..b3b66a9de1 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -0,0 +1,222 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x2_32x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x128_8x2_16x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x2_32x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// 
Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x2_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + + diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..25e3ee0d5f --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -0,0 +1,397 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + 
cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; 
+ using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x5_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 5, 
+ cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); +} +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x3_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex<float>; + using ElementB = cutlass::complex<float>; + using ElementC = cutlass::complex<float>; + using ElementAccumulator = cutlass::complex<float>; + using ElementCompute = cutlass::complex<float>; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu new file mode 100644 index 0000000000..e151f5a78f --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -0,0 +1,121 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..4c8102a503 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,124 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu new file mode 100644 index 0000000000..15f5585839 --- /dev/null +++ 
b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) + +TEST(SM70_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu new file mode 100644 index 0000000000..b54359f177 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -0,0 +1,121 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test 
sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..51d2b942f4 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,124 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#if 0 +TEST(SM80_Device_Conv2d_Fprop_Precomputed_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu new file mode 100644 index 0000000000..820f0fb89f --- /dev/null +++ 
b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu @@ -0,0 +1,82 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x2_64x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..746e7d7b0b --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -0,0 +1,321 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + 
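+ // Note: TestAllConv2d (brought in via conv2d_testbed.h) is expected to sweep a set of representative Conv2dProblemSize configurations through the device-level convolution defined above, appending any user-supplied problem sizes, and to verify each result against a reference implementation.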
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + test::conv::device::Conv2dProblemVector user_size; + + user_size.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, 4}, // input size (NHWC) + {8, 1, 1, 4}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>(user_size)); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
+  128x128_8x4_64x32x8) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = float;
+  using ElementB = float;
+  using ElementC = float;
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 8>,
+    cutlass::gemm::GemmShape<64, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
new file mode 100644
index 0000000000..7255eac644
--- /dev/null
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
@@ -0,0 +1,520 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x128_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x128_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + 
cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x256_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x64_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x256_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x128_128x2_32x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x128_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + 
cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x128_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x256_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x64_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x256_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x128_128x2_32x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 128>, + 
cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu new file mode 100644 index 0000000000..7e9bb9060b --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu @@ -0,0 +1,521 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x128_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x128_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x256_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = 
cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x64_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x256_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + 
cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x128_128x4_32x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x128_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x128_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = 
int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x256_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x64_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + 
cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x256_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x128_128x4_32x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu new file mode 100644 index 0000000000..5426003779 --- /dev/null +++ 
b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu @@ -0,0 +1,119 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu new file mode 100644 index 0000000000..d0ba7a5047 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu @@ -0,0 +1,121 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu new file mode 100644 index 0000000000..fbab373165 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu @@ -0,0 +1,679 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x256_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using 
ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x64_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x256_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = 
cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x64_64x2_64x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x128_64x2_32x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x64_64x2_32x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, 
cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + 
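
The tests in this file only instantiate types; the host-side launch sequence is hidden inside `TestAllInterleavedConv2d`. The sketch below shows how such a `Conv2dFprop` specialization is typically driven through the CUTLASS 2.x device-level API (`Arguments`, `can_implement`, `initialize`, `operator()`). The helper name `run_fprop_once`, the tensor names, and the alpha/beta values are illustrative assumptions, not part of this diff.

```cpp
// Minimal host-side driver sketch (not part of this diff). It assumes the CUTLASS 2.x
// device-level API of cutlass::conv::device::ImplicitGemmConvolution and the utility
// headers under tools/util; the helper name and problem dimensions are hypothetical.
#include "cutlass/cutlass.h"
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/device_memory.h"

template <typename ImplicitGemm>
cutlass::Status run_fprop_once(cutlass::conv::Conv2dProblemSize const &problem_size) {

  using ElementA = typename ImplicitGemm::ElementA;
  using ElementB = typename ImplicitGemm::ElementB;
  using ElementC = typename ImplicitGemm::ElementC;
  using LayoutA  = typename ImplicitGemm::LayoutA;
  using LayoutB  = typename ImplicitGemm::LayoutB;
  using LayoutC  = typename ImplicitGemm::LayoutC;

  // Activation (N,H,W,C), filter (K,R,S,C), and output (N,P,Q,K) tensors.
  // Filling them with data and copying host -> device is omitted in this sketch.
  cutlass::HostTensor<ElementA, LayoutA> tensor_a(
      {problem_size.N, problem_size.H, problem_size.W, problem_size.C});
  cutlass::HostTensor<ElementB, LayoutB> tensor_b(
      {problem_size.K, problem_size.R, problem_size.S, problem_size.C});
  cutlass::HostTensor<ElementC, LayoutC> tensor_c(
      {problem_size.N, problem_size.P, problem_size.Q, problem_size.K});
  cutlass::HostTensor<ElementC, LayoutC> tensor_d(
      {problem_size.N, problem_size.P, problem_size.Q, problem_size.K});

  // ElementCompute is float in every kernel instantiated in this file
  float alpha = 1.0f;
  float beta  = 0.0f;

  typename ImplicitGemm::Arguments arguments{
      problem_size,
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      tensor_c.device_ref(),
      tensor_d.device_ref(),
      {alpha, beta}};

  ImplicitGemm conv_op;

  // Query and allocate any workspace the kernel needs, then check, initialize, and launch
  size_t workspace_size = conv_op.get_workspace_size(arguments);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  cutlass::Status status = conv_op.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  status = conv_op.initialize(arguments, workspace.get());
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  return conv_op();   // enqueues the implicit GEMM convolution kernel
}
```

A call such as `run_fprop_once<Conv2dFprop>(problem_size)` performs a single forward convolution; the `TestAllInterleavedConv2d` helper repeats equivalent steps over its built-in list of problem sizes.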
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x256_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x64_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x256_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 64>, + 
cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x64_64x2_64x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x128_64x2_32x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 
64x64_64x2_32x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu new file mode 100644 index 0000000000..e8b7c44fe2 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu @@ -0,0 +1,680 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x256_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using 
Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x64_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x256_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + 
/// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x64_64x4_64x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x128_64x4_32x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x64_64x6_32x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + 
ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 6, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + 
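
For orientation, the test names encode the tile configuration of the GEMM that Fprop is mapped onto: `128x128_64x3_64x64x64` corresponds to the `GemmShape<128, 128, 64>` threadblock tile with 3 pipeline stages and the `GemmShape<64, 64, 64>` warp tile used above. The function below spells out that implicit-GEMM extent for Fprop; it is only an illustrative restatement of the arithmetic that CUTLASS exposes as `cutlass::conv::implicit_gemm_problem_size`.

```cpp
// Standalone sketch (not part of this diff) of the GEMM extent that Conv2d Fprop is
// mapped onto; CUTLASS ships an equivalent helper, cutlass::conv::implicit_gemm_problem_size.
#include "cutlass/gemm/gemm.h"
#include "cutlass/conv/conv2d_problem_size.h"

inline cutlass::gemm::GemmCoord fprop_gemm_extent(
    cutlass::conv::Conv2dProblemSize const &problem) {

  // GEMM M: one row per output activation (N x P x Q)
  int gemm_m = problem.N * problem.P * problem.Q;

  // GEMM N: one column per output channel (filter count K)
  int gemm_n = problem.K;

  // GEMM K: reduction over the filter footprint and input channels (R x S x C)
  int gemm_k = problem.R * problem.S * problem.C;

  return cutlass::gemm::GemmCoord(gemm_m, gemm_n, gemm_k);
}
```

The threadblock swizzle then covers the resulting M x N extent with (for example) 128x128 tiles while the mainloop steps through K in 64-element chunks across the configured number of stages.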
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x256_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x64_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x256_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + 
cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x64_64x4_64x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x128_64x4_32x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 
64x64_64x6_32x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 6, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu new file mode 100644 index 0000000000..e5146be328 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu @@ -0,0 +1,119 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, + 128x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, + 128x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu new file mode 100644 index 0000000000..4cfdd3722d --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright 
(c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation 
element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..c1a1f647a3 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_problems.h b/test/unit/conv/device/conv2d_problems.h new file mode 100644 index 0000000000..74b43e11c7 --- /dev/null +++ b/test/unit/conv/device/conv2d_problems.h @@ -0,0 +1,520 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed sizes for Conv2d problem +*/ +#pragma once + +#include + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#define CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 1 + +namespace test { +namespace conv { +namespace device { + +using Conv2dProblemVector = std::vector; + +// +// Structures to prune items from Conv2dProblemVector +// +// Specification template for pruning items for convolution problem lists +template struct Specification +{ + virtual ~Specification() = default; + virtual bool is_satisfied(T item) const = 0; +}; + +// input size (NHWC) specification +struct InputSizeSpecification : Specification +{ + cutlass::Tensor4DCoord input_size; + + InputSizeSpecification(cutlass::Tensor4DCoord input_size_) : input_size(input_size_) {} + + bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override { + return ((input_size.n() == item.N) && (input_size.h() == item.H) && (input_size.w() == item.W) && (input_size.c() == item.C)); + } +}; + +// stride (stride_h, stride_w) specification +struct StrideSpecification : Specification +{ + cutlass::MatrixCoord stride; + + StrideSpecification(cutlass::MatrixCoord stride_) : stride(stride_) {} + + bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override { + return ((stride.row() == item.stride_h) && (stride.column() == item.stride_h)); + } +}; + +// channel (C,K) specification, must be multiple of minimum channel +struct ChannelDivisibilitySpecification : Specification +{ + int channel_multiple; + + ChannelDivisibilitySpecification(int channel_multiple_) : channel_multiple(channel_multiple_) {} + + bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override { + return ((item.K % channel_multiple == 0) && (item.C % channel_multiple == 0)); + } +}; + +// +// Pruning function for items from Conv2dProblemVector based on a Specification +// +inline Conv2dProblemVector prune(Conv2dProblemVector const &items, + Specification const &spec) +{ + Conv2dProblemVector pruned_list; + + for (auto& p : items) + if (spec.is_satisfied(p)) + pruned_list.push_back(p); + return pruned_list; +} + + +//////////////////////////////////////////////////////////////////////////// +/// Structure TestbedConv2dProblemSizes initializes and holds conv default and +/// important network sizes +//////////////////////////////////////////////////////////////////////////// +struct TestbedConv2dProblemSizes { + + // + // Data members + // + int minimum_channel_size; + + Conv2dProblemVector conv2d_default_sizes; + Conv2dProblemVector conv2d_rigorous_sizes; + Conv2dProblemVector conv2d_resnet50_sizes; + Conv2dProblemVector conv2d_resnet50_sizes_perf; + + // + // Methods + // + /// Default ctor + TestbedConv2dProblemSizes(int minimum_channel_size_ = 64): 
minimum_channel_size (minimum_channel_size_) { + initialize_conv2d_default_sizes(); + initialize_conv2d_rigorous_sizes(); + initialize_conv2d_resnet50_sizes(conv2d_resnet50_sizes, 1 /*batch-size*/); + + initialize_conv2d_resnet50_sizes(conv2d_resnet50_sizes_perf, 34 /*batch-size*/); + filter_all(); + } + + /// Eliminates some illegal cases + void filter_all() { + + Conv2dProblemVector *problems_vectors[] = { + &conv2d_default_sizes, + &conv2d_rigorous_sizes, + &conv2d_resnet50_sizes, + &conv2d_resnet50_sizes_perf + }; + + for (Conv2dProblemVector *problems : problems_vectors) { + Conv2dProblemVector filtered; + + for (cutlass::conv::Conv2dProblemSize const & problem : *problems) { + if (!(problem.C % minimum_channel_size)) { + filtered.push_back(problem); + } + } + + *problems = filtered; + } + } + + // Add a few standard convolution problem sizes + void initialize_conv2d_default_sizes() { + + //////////////////////////////////////////////////////////////////////////////////////////// + // Very Small input size (1x8x8xminimum_channel_size), filter size (3x3 - 7x7), stride (1,1) + // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64} + //////////////////////////////////////////////////////////////////////////////////////////// + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 3, 3, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 4, 4, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 5, 5, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 6, 5, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 6, 6, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 7, 7, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + //////////////////////////////////////////////////////////////////////////////////// + // Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1) + //////////////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 15, 19, 160}, // 
input size (NHWC) + {224, 1, 1, 160}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 16, 16, 160}, // input size (NHWC) + {224, 2, 3, 160}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 23, 21, 128}, // input size (NHWC) + {224, 3, 3, 128}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 29, 37, 160}, // input size (NHWC) + {224, 5, 5, 160}, // filter size (KRSC) + {2, 2, 2, 2}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + //////////////////////////////////////////////////////////////////////////////////// + // C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64} + //////////////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 15, 19, 32 + minimum_channel_size}, // input size (NHWC) + {96, 3, 3, 32 + minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 16, 16, 64 + minimum_channel_size}, // input size (NHWC) + {96, 3, 3, 64 + minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + //////////////////////////////////////////////////////////////////////////////////// + // Medium input size (1x16x16x128), filter size (1x1, 3,x3, 5x5), stride (2, 2) + //////////////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 19, 37, 160}, // input size (NHWC) + {224, 3, 3, 160}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 16, 16, 288}, // input size (NHWC) + {160, 5, 5, 288}, // filter size (KRSC) + {2, 2, 2, 2}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + ///////////////////////////////////////////////////////////////////////////// + // Additional input size + ///////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {3, 28, 28, 256}, // input size (NHWC) + {256, 2, 2, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {32, 32, 32, 32}, // input size (NHWC) + {32, 1, 1, 32}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, 
stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {4, 3, 3, 128}, // input size (NHWC) + {256, 3, 3, 128}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {4, 3, 3, 256} // output size (NPQK) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {4, 1, 1, 256}, // input size (NHWC) + {328, 3, 3, 256}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {4, 1, 1, 328} // output size (NPQK) + )); + + } + + + // Add a few large and rigorous convolution problem sizes + void initialize_conv2d_rigorous_sizes() { + +#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED + conv2d_rigorous_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 124, 224, 96}, // input size (NHWC) + {24, 7, 7, 96}, // filter size (KRSC) + {1, 229, 129, 32} // output size (NPQK) + )); + + conv2d_rigorous_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 233, 35, 48}, // input size (NHWC) + {24, 7, 5, 48}, // filter size (KRSC) + {1, 233, 35, 24} // output size (NPQK) + )); + +#endif + + } + + + // Add resent50 layers to unit testing sizes + void initialize_conv2d_resnet50_sizes(Conv2dProblemVector &conv2d_problem_vector, int batch_size = 1){ + +#if 0 // Resnet50 first layer (layer_id = 0) with channel = 3 is not supported in cutlass + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + [1, 224, 224, 3], // input size (NHWC) + [64, 7, 7, 3], // filter size (KRSC) + [3, 3, 3, 3], // padding (pad_h, _, pad_w, _) + [2, 2], // stride (stride_h, stride_w) + [1, 1], // dilation (dilation_h, dilation_w) + )); +#endif + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 64}, // input size (NHWC) + {256, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 256}, // input size (NHWC) + {64, 1, 1, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 256}, // input size (NHWC) + {512, 1, 1, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 256}, // input size (NHWC) + {128, 1, 1, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, 
pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 128}, // input size (NHWC) + {128, 3, 3, 128}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 128}, // input size (NHWC) + {512, 1, 1, 128}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 512}, // input size (NHWC) + {128, 1, 1, 512}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 512}, // input size (NHWC) + {1024, 1, 1, 512}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 512}, // input size (NHWC) + {256, 1, 1, 512}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 256}, // input size (NHWC) + {256, 3, 3, 256}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 256}, // input size (NHWC) + {1024, 1, 1, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 1024}, // input size (NHWC) + {256, 1, 1, 1024}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 1024}, // input size (NHWC) + {2048, 1, 1, 1024}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 1024}, // input size (NHWC) + {512, 1, 1, 1024}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 7, 7, 512}, // input size (NHWC) + {512, 3, 3, 512}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 7, 7, 512}, // input size 
(NHWC) + {2048, 1, 1, 512}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 7, 7, 2048}, // input size (NHWC) + {512, 1, 1, 2048}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + } + +}; + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h new file mode 100644 index 0000000000..14bdd9bf13 --- /dev/null +++ b/test/unit/conv/device/conv2d_testbed.h @@ -0,0 +1,558 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "conv2d_problems.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_compare.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/tensor_view_io.h" + +namespace test { +namespace conv { +namespace device { + +template +class TestbedConv2d { +public: + + using ElementA = typename Conv2d::ElementA; + using LayoutA = typename Conv2d::LayoutA; + using ElementB = typename Conv2d::ElementB; + using LayoutB = typename Conv2d::LayoutB; + using ElementC = typename Conv2d::ElementC; + using LayoutC = typename Conv2d::LayoutC; + using ElementAccumulator = typename Conv2d::ElementAccumulator; + using ElementCompute = typename Conv2d::ElementCompute; + using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator; + + /// Reduction kernel + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + >; + + using ReductionDevice = cutlass::reduction::device::ReduceSplitK; + + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_D_computed; + cutlass::HostTensor tensor_D_reference; + +public: + + TestbedConv2d( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 8) { + scope = 2; + } + else if (bits == 16) { + scope = 3; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) { + + 
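+ // Descriptive note (added annotation): the resize calls below size A/B/C and both D tensors from the implicit GEMM extents implied by the convolutional operator (fprop/dgrad/wgrad) and the Conv2d problem size, then the tensors are randomly initialized and copied to the device.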
tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size)); + tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size)); + tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + + initialize_tensor(tensor_A.host_view(), init_A, seed); + initialize_tensor(tensor_B.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C.host_view(), init_C, seed * 39); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D_computed.sync_device(); + tensor_D_reference.sync_device(); + } + + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Conv2d::ImplicitGemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute(0)) { + + // Waive test if CUDA device is insufficient + if (!sufficient()) { + return true; + } + +#if 0 //display conv2d problem size for debugging + std::cout << problem_size << std::endl + << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl + << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? 
"(serial)" : "(parallel)") << std::endl + << std::endl; +#endif + + initialize(problem_size); + + // configure the operator + Conv2d conv2d_op; + + typename Conv2d::Arguments conv2d_args( + problem_size, + tensor_A.device_ref(), + tensor_B.device_ref(), + tensor_C.device_ref(), + tensor_D_computed.device_ref(), + {alpha, beta}, + split_k_mode + ); + + // find workspace requirement for parallel split-k reduction + size_t workspace_size = Conv2d::get_workspace_size(conv2d_args); + + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get()); + + if (status != cutlass::Status::kSuccess) { + cudaError_t error = cudaGetLastError(); + std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n"; + return true; + } + + // conv2d operation with parallel split-k-mode + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // conv2d output is written to workspace in global memory + conv2d_args.ref_D.reset(reinterpret_cast(workspace.get())); + // accumulate mma for each cta in k-dimension (1.0 * A * B) + conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)}; + // update conv2d operator arguments + status = conv2d_op.update(conv2d_args, workspace.get()); + } + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run conv2d operator + status = conv2d_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // configure parallel reduction operator + ReductionDevice reduction_op; + + typename ReductionDevice::Arguments reduction_args( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(), + problem_size.split_k_slices, + cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size), + {reinterpret_cast (workspace.get()), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_D_computed.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_C.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C + ); + + status = reduction_op.initialize(reduction_args, nullptr); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run prallel reduction kernel + status = reduction_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + } + bool passed = false; + + tensor_D_computed.sync_host(); + +#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED + + cutlass::reference::device::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size, + tensor_A.device_ref(), + tensor_B.device_ref(), + tensor_C.device_ref(), + tensor_D_reference.device_ref(), + alpha, + beta); + + cudaError_t result = cudaDeviceSynchronize(); + EXPECT_EQ(result, cudaSuccess) << " device reference error: " + << cudaGetErrorString(result); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D_reference.sync_host(); + +#else + + cutlass::reference::host::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + 
ElementC, + LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size, + tensor_A.host_ref(), + tensor_B.host_ref(), + tensor_C.host_ref(), + tensor_D_reference.host_ref(), + alpha, + beta); + +#endif + passed = cutlass::reference::host::TensorEquals( + tensor_D_computed.host_view(), + tensor_D_reference.host_view()); + + EXPECT_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_Conv2d_ImplicitGemm_device_" + << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_") + << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" : + (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) + << "nhwc_" + << problem_size.N << "x" + << problem_size.H << "x" + << problem_size.W << "x" + << problem_size.C + << "_krsc_" + << problem_size.K << "x" + << problem_size.R << "x" + << problem_size.S << "x" + << problem_size.C + << "_padding_" + << problem_size.pad_h << "x" + << problem_size.pad_w + << "_stride_" + << problem_size.stride_h << "x" + << problem_size.stride_w + << "_dilation_" + << problem_size.dilation_h << "x" + << problem_size.dilation_w << "_" + << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_") + << Conv2d::ThreadblockShape::kM << "x" + << Conv2d::ThreadblockShape::kN << "x" + << Conv2d::ThreadblockShape::kK << "_" + << Conv2d::WarpShape::kM << "x" + << Conv2d::WarpShape::kN << "x" + << Conv2d::WarpShape::kK << ".txt"; + + std::cout << fname.str() << std::endl; + + std::ofstream results(fname.str()); + + results << problem_size << std::endl; + + results + << "\nA:\n" << tensor_A.host_view() << "\n" + << "\nB:\n" << tensor_B.host_view() << "\n" + << "\nC:\n" << tensor_C.host_view() << "\n" + << "\nD reference:\n" << tensor_D_reference.host_view() << "\n" + << "\nD computed:\n" << tensor_D_computed.host_view() << "\n"; + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference +// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes +// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// (conv_blacklist_sizes) +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +bool TestAllConv2d( + const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(), + const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) { + + bool passed = true; + + // + // Testbed object + // + + TestbedConv2d testbed; + + // + // Get conv problem sizes to run conv operator + // + TestbedConv2dProblemSizes conv_problems(128/cutlass::sizeof_bits::value); + + // Vector of conv2d problem sizes to avoid duplicate runs + Conv2dProblemVector conv_tested_sizes; + + Conv2dProblemVector const *problem_vectors[] = { + &conv_test_sizes, // run user specified sizes + &conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes + &conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes +#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED + &conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled +#endif + }; + + // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0) + for 
(Conv2dProblemVector const * problem_vector : problem_vectors) { + + // Run conv testbed on default convolution sizes + for(auto conv_problem : *problem_vector) { + + // Skip blacklist and avoid duplicate problem sizes + if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() || + std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) { + continue; + } + + // + // Procedurally disable certain cases + // + + // CUTLASS DGRAD's unity stride specialization only support stride {1, 1} + if ((ImplicitGemm::kConvolutionalOperator == + cutlass::conv::Operator::kDgrad) && + (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == + cutlass::conv::StrideSupport::kUnity)) { + if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) { + continue; + } + } + + // + // Test + // + // push back tested problem size to avoid re-running duplicates + conv_tested_sizes.push_back(conv_problem); + + // test mode = xcross + passed = testbed.run( + conv_problem, + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + + // test mode = convolution + passed = testbed.run( + conv_problem.reset_mode(cutlass::conv::Mode::kConvolution), + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + } + } + + // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for + // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters + // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep + // alpha and beta for local testing, but only runs one value for alpha and beta. + cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( + {1, 17, 11, 288}, // input size (NHWC) + {160, 3, 3, 288}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + ); + + cutlass::conv::SplitKMode split_k_modes [] = { + cutlass::conv::SplitKMode::kSerial, + cutlass::conv::SplitKMode::kParallel, + }; + + int split_k_slices[] = { + 1, 2, 3, 4, 201 + }; + + double problem_alpha[] = { + 2.0 + }; + + double problem_beta[] = { + 2.0 + }; + + for (auto split_k_mode : split_k_modes) { + for (auto split_k_slice : split_k_slices) { + for (auto alpha : problem_alpha) { + for (auto beta : problem_beta) { + + passed = testbed.run( + conv2d_split_k_test_size.reset_split_k_slices(split_k_slice), + split_k_mode, + cutlass::from_real(alpha), + cutlass::from_real(beta)); + + if (!passed) { + return false; + } + } + } + } + } + + return passed; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h new file mode 100644 index 0000000000..cb4ecc7056 --- /dev/null +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -0,0 +1,534 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "conv2d_problems.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/host_reorder.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/tensor_view_io.h" + +namespace test { +namespace conv { +namespace device { + +template +class InterleavedTestbedConv2d { +public: + + using ElementA = typename Conv2d::ElementA; + using LayoutA = typename Conv2d::LayoutA; + using ElementB = typename Conv2d::ElementB; + using LayoutB = typename Conv2d::LayoutB; + using ElementC = typename Conv2d::ElementC; + using LayoutC = typename Conv2d::LayoutC; + using ElementAccumulator = typename Conv2d::ElementAccumulator; + using ElementCompute = typename Conv2d::ElementCompute; + using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator; + + /// Reduction kernel + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + >; + + using ReductionDevice = cutlass::reduction::device::ReduceSplitK; + + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + 
cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_B_reordered; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_D_computed; + cutlass::HostTensor tensor_D_reference; + +public: + + InterleavedTestbedConv2d( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 8) { + scope = 2; + } + else if (bits == 16) { + scope = 3; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) { + + tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size)); + tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size)); + tensor_B_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size)); + tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + + initialize_tensor(tensor_A.host_view(), init_A, seed); + initialize_tensor(tensor_B.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C.host_view(), init_C, seed * 39); + + cutlass::reorder_convK( + tensor_B_reordered.host_ref(), tensor_B.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size)); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_B_reordered.sync_device(); + tensor_C.sync_device(); + tensor_D_computed.sync_device(); + tensor_D_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute(0)) { + +#if 0 //display conv2d problem size for debugging + std::cout << problem_size << std::endl + << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl + << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? 
"(serial)" : "(parallel)") << std::endl + << std::endl; +#endif + + initialize(problem_size); + + // configure the operator + Conv2d conv2d_op; + + typename Conv2d::Arguments conv2d_args( + problem_size, + tensor_A.device_ref(), + tensor_B_reordered.device_ref(), + tensor_C.device_ref(), + tensor_D_computed.device_ref(), + {alpha, beta}, + split_k_mode + ); + + // find workspace requirement for parallel split-k reduction + size_t workspace_size = Conv2d::get_workspace_size(conv2d_args); + + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get()); + + // conv2d operation with parallel split-k-mode + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // conv2d output is written to workspace in global memory + conv2d_args.ref_D.reset(reinterpret_cast(workspace.get())); + // accumulate mma for each cta in k-dimension (1.0 * A * B) + conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)}; + // update conv2d operator arguments + status = conv2d_op.update(conv2d_args, workspace.get()); + } + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run conv2d operator + status = conv2d_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // configure parallel reduction operator + ReductionDevice reduction_op; + + typename ReductionDevice::Arguments reduction_args( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(), + problem_size.split_k_slices, + cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size), + {reinterpret_cast (workspace.get()), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_D_computed.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_C.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C + ); + + status = reduction_op.initialize(reduction_args, nullptr); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run prallel reduction kernel + status = reduction_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + } + bool passed = false; + + tensor_D_computed.sync_host(); + +#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED + + cutlass::reference::device::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size, + tensor_A.device_ref(), + tensor_B.device_ref(), + tensor_C.device_ref(), + tensor_D_reference.device_ref(), + alpha, + beta); + + cudaError_t result = cudaDeviceSynchronize(); + EXPECT_EQ(result, cudaSuccess) << " device reference error: " + << cudaGetErrorString(result); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D_reference.sync_host(); + +#else + + cutlass::reference::host::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size, + 
tensor_A.host_ref(), + tensor_B.host_ref(), + tensor_C.host_ref(), + tensor_D_reference.host_ref(), + alpha, + beta); + +#endif + passed = cutlass::reference::host::TensorEquals( + tensor_D_computed.host_view(), + tensor_D_reference.host_view()); + + EXPECT_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_Conv2d_ImplicitGemm_device_" + << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_") + << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" : + (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) + << "nhwc_" + << problem_size.N << "x" + << problem_size.H << "x" + << problem_size.W << "x" + << problem_size.C + << "_krsc_" + << problem_size.K << "x" + << problem_size.R << "x" + << problem_size.S << "x" + << problem_size.C + << "_padding_" + << problem_size.pad_h << "x" + << problem_size.pad_w + << "_stride_" + << problem_size.stride_h << "x" + << problem_size.stride_w + << "_dilation_" + << problem_size.dilation_h << "x" + << problem_size.dilation_w << "_" + << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_") + << Conv2d::ThreadblockShape::kM << "x" + << Conv2d::ThreadblockShape::kN << "x" + << Conv2d::ThreadblockShape::kK << "_" + << Conv2d::WarpShape::kM << "x" + << Conv2d::WarpShape::kN << "x" + << Conv2d::WarpShape::kK << ".txt"; + + std::cout << fname.str() << std::endl; + + std::ofstream results(fname.str()); + + results << problem_size << std::endl; + + results + << "\nA:\n" << tensor_A.host_view() << "\n" + << "\nB:\n" << tensor_B.host_view() << "\n" + << "\nB_reordered =\n" << tensor_B_reordered.host_view() << "\n" + << "\nC:\n" << tensor_C.host_view() << "\n" + << "\nD reference:\n" << tensor_D_reference.host_view() << "\n" + << "\nD computed:\n" << tensor_D_computed.host_view() << "\n"; + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference +// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes +// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// (conv_blacklist_sizes) +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +bool TestAllInterleavedConv2d( + const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(), + const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) { + + bool passed = true; + + // + // Testbed object + // + + InterleavedTestbedConv2d testbed; + + // + // Get conv problem sizes to run conv operator + // + TestbedConv2dProblemSizes conv_problems(InterleavedK); // minimum channel size must be multiple of InterleavedK for interleaved layout + + // Vector of conv2d problem sizes to avoid duplicate runs + Conv2dProblemVector conv_tested_sizes; + + Conv2dProblemVector const *problem_vectors[] = { + &conv_test_sizes, // run user specified sizes + &conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes + &conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes +#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED + &conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled +#endif + }; + + // Sweep conv2d problem sizes (split-k-mode=kSerial, 
split-k-slice=1, alpha=1.0, beta=0.0) + for (Conv2dProblemVector const * problem_vector : problem_vectors) { + + ChannelDivisibilitySpecification channel_spec(InterleavedK); //input and output channels must be multiple of InterleavedK + auto pruned_problem_vector = prune(*problem_vector, channel_spec); + + // Run conv testbed on default convolution sizes + for(auto conv_problem : pruned_problem_vector) { + + // Skip blacklist and avoid duplicate problem sizes + if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() || + std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) { + continue; + } + + // + // Procedurally disable certain cases + // + + // CUTLASS DGRAD's unity stride specialization only support stride {1, 1} + if ((ImplicitGemm::kConvolutionalOperator == + cutlass::conv::Operator::kDgrad) && + (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == + cutlass::conv::StrideSupport::kUnity)) { + if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) { + continue; + } + } + + // + // Test + // + // push back tested problem size to avoid re-running duplicates + conv_tested_sizes.push_back(conv_problem); + + // test mode = xcross + passed = testbed.run( + conv_problem, + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + + // test mode = convolution + passed = testbed.run( + conv_problem.reset_mode(cutlass::conv::Mode::kConvolution), + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + } + } + +#if 0 + // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for + // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters + // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep + // alpha and beta for local testing, but only runs one value for alpha and beta. 
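+ // Descriptive note (added annotation): this split-k / alpha / beta sweep mirrors the one in TestAllConv2d but is compiled out here (#if 0) for the interleaved testbed.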
+ cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( + {1, 17, 11, 288}, // input size (NHWC) + {160, 3, 3, 288}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + ); + + cutlass::conv::SplitKMode split_k_modes [] = { + cutlass::conv::SplitKMode::kSerial, + cutlass::conv::SplitKMode::kParallel, + }; + + int split_k_slices[] = { + 1, 2, 3, 4, 201 + }; + + double problem_alpha[] = { + 2.0 + }; + + double problem_beta[] = { + 2.0 + }; + + for (auto split_k_mode : split_k_modes) { + for (auto split_k_slice : split_k_slices) { + for (auto alpha : problem_alpha) { + for (auto beta : problem_beta) { + + passed = testbed.run( + conv2d_split_k_test_size.reset_split_k_slices(split_k_slice), + split_k_mode, + cutlass::from_real(alpha), + cutlass::from_real(beta)); + + if (!passed) { + return false; + } + } + } + } + } +#endif + + return passed; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu new file mode 100644 index 0000000000..07961dd2b7 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -0,0 +1,172 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x2_32x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x2_32x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x2_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// 
Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..a68a30fe5b --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -0,0 +1,311 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using 
ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = 
cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu new file mode 100644 index 0000000000..3cbde02888 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -0,0 +1,122 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + + +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = 
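The epilogue's vector width in these kernels is written as the number of elements per 128-bit memory access; the template argument stripped from `cutlass::sizeof_bits::value` above is presumably `ElementC`. For the half-precision output used in this file the expression evaluates to 8, and for the float and tf32 output files later in the diff it evaluates to 4:

#include "cutlass/numeric_types.h"

static_assert(128 / cutlass::sizeof_bits<cutlass::half_t>::value == 8,
              "8 half_t elements per 128-bit access");
static_assert(128 / cutlass::sizeof_bits<float>::value == 4,
              "4 float elements per 128-bit access");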
cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu new file mode 100644 index 0000000000..ffb79d77ad --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) + +TEST(SM70_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED + diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu new file mode 100644 index 0000000000..1101090a12 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED + diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..ade6f8df32 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,161 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
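The SM70 and SM75 tests above differ from the SM80 files mainly in the tensor core instruction shape they target. A compile-time summary of the shapes used across these wgrad tests is sketched below; the static_assert is only there to make the K-depth difference explicit.

#include "cutlass/gemm/gemm.h"

using Sm70Mma     = cutlass::gemm::GemmShape<8, 8, 4>;    // Volta HMMA, f16 inputs, f32 accumulate
using Sm75Mma     = cutlass::gemm::GemmShape<16, 8, 8>;   // Turing HMMA, f16 inputs
using Sm80MmaF16  = cutlass::gemm::GemmShape<16, 8, 16>;  // Ampere, f16 inputs
using Sm80MmaTf32 = cutlass::gemm::GemmShape<16, 8, 8>;   // Ampere, tf32 inputs

static_assert(Sm80MmaF16::kK == 4 * Sm70Mma::kK,
              "an Ampere f16 MMA consumes a 4x deeper K-slice than a Volta HMMA");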
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + 
cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 64x256_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32 >, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..a0aac81147 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -0,0 +1,321 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
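The 64x256 variant above adds an explicit `cutlass::conv::StrideSupport::kStrided` argument after the iterator algorithm. `kStrided` marks a kernel specialization that accepts arbitrary convolution strides, whereas its counterpart `kUnity` is limited to unit stride. A stride-2 problem of the kind such a kernel is expected to cover, with illustrative sizes not taken from the testbed:

cutlass::conv::Conv2dProblemSize strided_problem(
    {1, 56, 56, 64},   // input size  (NHWC)
    {128, 3, 3, 64},   // filter size (KRSC)
    {1, 1, 1, 1},      // padding     (pad_h, _, pad_w, _)
    {2, 2},            // stride      (stride_h, stride_w) != 1
    {1, 1}             // dilation    (dilation_h, dilation_w)
);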
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + test::conv::device::Conv2dProblemVector user_size; + + user_size.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, 4}, // input size (NHWC) + {8, 1, 1, 4}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d(user_size)); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation 
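The user-defined problem pushed into `user_size` above has a 1x1 filter, zero padding, and unit stride, so its output extent follows directly from the standard convolution formula, and the resulting implicit-GEMM extents for wgrad are deliberately tiny. A worked version follows; the P and Q members are assumed to be computed by the `Conv2dProblemSize` constructor.

//   P = (H + pad_h0 + pad_h1 - ((R - 1) * dilation_h + 1)) / stride_h + 1
//     = (8 + 0 + 0 - 1) / 1 + 1 = 8,   and Q = 8 by the same formula,
//   so the output tensor is {1, 8, 8, 8} (NPQK).
//   For wgrad the implicit GEMM problem is therefore
//     GEMM_M = K         = 8
//     GEMM_N = R * S * C = 1 * 1 * 4 = 4
//     GEMM_K = N * P * Q = 1 * 8 * 8 = 64
cutlass::conv::Conv2dProblemSize user_problem(
    {1, 8, 8, 4},   // input size  (NHWC)
    {8, 1, 1, 4},   // filter size (KRSC)
    {0, 0, 0, 0},   // padding
    {1, 1},         // stride
    {1, 1}          // dilation
);
// user_problem.P == 8 and user_problem.Q == 8 here.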
element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..2185257f15 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..211a331d8b --- /dev/null +++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,80 @@ 
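The TF32 test above feeds `cutlass::tfloat32_t` operands into a 16x8x8 tensor core instruction. `tfloat32_t` keeps float's 8-bit exponent but only 10 explicit mantissa bits, and it occupies a full 32-bit word, so its memory vectorization matches f32. A small illustration, assuming the constructor-from-float and float conversion provided by `cutlass/numeric_types.h`:

#include "cutlass/numeric_types.h"

static_assert(cutlass::sizeof_bits<cutlass::tfloat32_t>::value == 32,
              "tf32 is stored in a 32-bit word");

float exact = 1.0f + 1.0f / 1024.0f;   // 1 + 2^-10 fits in a 10-bit mantissa
cutlass::tfloat32_t t(exact);          // float(t) == exact
// Values requiring more mantissa bits (e.g. 1 + 2^-17) are rounded to the
// nearest representable tf32 value before the tensor core MMA consumes them.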
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Dgrad_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // 
CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..0aabef5ba6 --- /dev/null +++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,80 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Fprop_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_problems.h b/test/unit/conv/device/conv3d_problems.h new file mode 100644 index 0000000000..9cc618467e --- /dev/null +++ b/test/unit/conv/device/conv3d_problems.h @@ -0,0 +1,248 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed sizes for Conv2d problem +*/ +#pragma once + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_types.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/core_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + +namespace test { +namespace conv { +namespace device { + +using Conv3dProblemVector = std::vector; + +//////////////////////////////////////////////////////////////////////////// +/// Structure TestbedConv3dProblemSizes initializes and holds conv default and +/// important network sizes +//////////////////////////////////////////////////////////////////////////// +struct TestbedConv3dProblemSizes { + + // + // Data members + // + int minimum_channel_size; + Conv3dProblemVector conv3d_default_sizes; + Conv3dProblemVector conv3d_vnet_medical_sizes; + + // + // Methods + // + /// Default ctor + TestbedConv3dProblemSizes(int minimum_channel_size_ = 64): minimum_channel_size (minimum_channel_size_) { + + initialize_conv3d_default_sizes(); + initialize_conv3d_vnet_medical_sizes(conv3d_vnet_medical_sizes, 1 /*batch-size*/); + + filter_all(); + } + + /// Eliminates some illegal cases + void filter_all() { + + Conv3dProblemVector *problems_vectors[] = { + &conv3d_default_sizes, + &conv3d_vnet_medical_sizes + }; + + for (Conv3dProblemVector *problems : problems_vectors) { + Conv3dProblemVector filtered; + + for (cutlass::conv::Conv3dProblemSize const & problem : *problems) { + if (!(problem.C % minimum_channel_size)) { + filtered.push_back(problem); + } + } + + *problems = filtered; + } + } + + // Add a few standard convolution problem sizes + void initialize_conv3d_default_sizes() { + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 1, 3, 3, minimum_channel_size}, // input size (NDHWC) + {8, 1, 1, 1, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 1, 16, 16, minimum_channel_size}, // input size (NDHWC) + {8, 1, 3, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({0, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 1, 15, 19, 160}, // input size 
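As in the 2-D test headers, the element type of the problem vector above lost its template argument in this rendering; restored, the alias presumably reads as below. `filter_all()` then keeps only those problems whose channel count is a multiple of `minimum_channel_size` (64 by default), which the illustrative predicate makes explicit.

using Conv3dProblemVector = std::vector<cutlass::conv::Conv3dProblemSize>;

// Hypothetical helper equivalent to the condition inside filter_all():
bool keep_problem(cutlass::conv::Conv3dProblemSize const &problem,
                  int minimum_channel_size) {
  return (problem.C % minimum_channel_size) == 0;
}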
(NDHWC) + {224, 1, 3, 6, 160}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 2, 1, 1, minimum_channel_size}, // input size (NDHWC) + {8, 2, 1, 1, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 1, 7, 7, minimum_channel_size}, // input size (NDHWC) + {16, 1, 3, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 11, 15, 19, 64}, // input size (NDHWC) + {32, 4, 3, 6, 64}, // filter size (KTRSC) + cutlass::Coord<3>({2, 1, 3}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + } + + // Add vnet layers to unit testing sizes + void initialize_conv3d_vnet_medical_sizes(Conv3dProblemVector &conv3d_problem_vector, int batch_size = 1) { + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 32, 32, 32, 16}, // input size (NDHWC) + {32, 2, 2, 2, 16}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 16, 16, 16, 32}, // input size (NDHWC) + {32, 3, 3, 3, 32}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 16, 16, 16, 32}, // input size (NDHWC) + {64, 2, 2, 2, 32}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 8, 8, 8, 64}, // input size (NDHWC) + {64, 3, 3, 3, 64}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 8, 8, 8, 64}, // input size (NDHWC) + {128, 2, 2, 2, 64}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 
1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 4, 4, 4, 128}, // input size (NDHWC) + {128, 3, 3, 3, 128}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 8, 8, 8, 128}, // input size (NDHWC) + {128, 3, 3, 3, 128}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 16, 16, 16, 64}, // input size (NDHWC) + {64, 3, 3, 3, 64}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 32, 32, 32, 16}, // input size (NDHWC) + {64, 2, 2, 2, 16}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 16, 16, 16, 32}, // input size (NDHWC) + {128, 2, 2, 2, 32}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + } + +}; + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h new file mode 100644 index 0000000000..179520d158 --- /dev/null +++ b/test/unit/conv/device/conv3d_testbed.h @@ -0,0 +1,537 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
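The V-Net sizes above are deliberately chained: every strided 2x2x2 layer halves each spatial extent, while every 3x3x3 layer with padding 1 and stride 1 preserves it, so the output of one entry matches the input of the next. A minimal standalone sketch of that arithmetic, assuming the standard cross-correlation output-extent relation (the helper below is illustrative and not a CUTLASS API):

#include <cassert>

// Output extent along one spatial dimension for cross-correlation.
inline int conv_output_extent(int input, int filter, int pad, int stride, int dilation) {
  return (input + 2 * pad - ((filter - 1) * dilation + 1)) / stride + 1;
}

int main() {
  // {N, 32, 32, 32, 16} activation with a {32, 2, 2, 2, 16} filter, pad 0, stride 2 -> 16^3 output
  assert(conv_output_extent(32, 2, /*pad=*/0, /*stride=*/2, /*dilation=*/1) == 16);
  // {N, 16, 16, 16, 32} activation with a {32, 3, 3, 3, 32} filter, pad 1, stride 1 -> 16^3 output
  assert(conv_output_extent(16, 3, /*pad=*/1, /*stride=*/1, /*dilation=*/1) == 16);
  return 0;
}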
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "cutlass/util/reference/host/tensor_fill.h" + +#include "cutlass/util/reference/host/convolution.h" + +#include "cutlass/util/reference/host/tensor_compare.h" + +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "conv3d_problems.h" +#include "cutlass/core_io.h" + +namespace test { +namespace conv { +namespace device { + +template +class TestbedConv3d { +public: + + using ElementA = typename Conv3d::ElementA; + using LayoutA = typename Conv3d::LayoutA; + using ElementB = typename Conv3d::ElementB; + using LayoutB = typename Conv3d::LayoutB; + using ElementC = typename Conv3d::ElementC; + using LayoutC = typename Conv3d::LayoutC; + using ElementAccumulator = typename Conv3d::ElementAccumulator; + using ElementCompute = typename Conv3d::ElementCompute; + using EpilogueOutputOp = typename Conv3d::EpilogueOutputOp; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv3d::kConvolutionalOperator; + + /// Reduction kernel + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + >; + + using ReductionDevice = cutlass::reduction::device::ReduceSplitK; + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_D_computed; + cutlass::HostTensor tensor_D_reference; + +public: + + TestbedConv3d( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 8) { + scope = 2; + } + else if (bits == 16) { + scope = 4; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if 
(dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv3dProblemSize const &problem_size, uint64_t seed = 2019) { + + tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size)); + tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size)); + tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + + initialize_tensor(tensor_A.host_view(), init_A, seed); + initialize_tensor(tensor_B.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C.host_view(), init_C, seed * 39); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D_computed.sync_device(); + tensor_D_reference.sync_device(); + } + + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Conv3d::ImplicitGemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + + + /// Executes one test + bool run( + cutlass::conv::Conv3dProblemSize const &problem_size, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute()) { + + // Waive test if CUDA device is insufficient. + if (!sufficient()) { + return true; + } + +#if 0 //display conv2d problem size for debugging + std::cout << problem_size << std::endl + << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl + << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? 
"(serial)" : "(parallel)") << std::endl + << std::endl; +#endif + + initialize(problem_size); + + // configure the operator + Conv3d conv3d_op; + + typename Conv3d::Arguments conv3d_args( + problem_size, + tensor_A.device_ref(), + tensor_B.device_ref(), + tensor_C.device_ref(), + tensor_D_computed.device_ref(), + {alpha, beta}, + split_k_mode + ); + + // find workspace requirement for parallel split-k reduction + size_t workspace_size = Conv3d::get_workspace_size(conv3d_args); + + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = conv3d_op.initialize(conv3d_args, workspace.get()); + + if (status != cutlass::Status::kSuccess) { + cudaError_t error = cudaGetLastError(); + std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n"; + return true; + } + + // conv3d operation with parallel split-k-mode + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // conv3d output is written to workspace in global memory + conv3d_args.ref_D.reset(reinterpret_cast(workspace.get())); + // accumulate mma for each cta in k-dimension (1.0 * A * B) + conv3d_args.output_op = {1.0, 0.0}; + // update conv3d operator arguments + status = conv3d_op.update(conv3d_args, workspace.get()); + } + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run conv3d operator + status = conv3d_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // configure parallel reduction operator + ReductionDevice reduction_op; + + typename ReductionDevice::Arguments reduction_args( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(), + problem_size.split_k_slices, + cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size), + {reinterpret_cast (workspace.get()), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_D_computed.device_data(), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_C.device_data(), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C + ); + + status = reduction_op.initialize(reduction_args, nullptr); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run prallel reduction kernel + status = reduction_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + } + bool passed = false; + + cutlass::reference::host::Conv3d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + ElementCompute + >( + kConvolutionalOperator, + problem_size, + tensor_A.host_ref(), + tensor_B.host_ref(), + tensor_C.host_ref(), + tensor_D_reference.host_ref(), + alpha, + beta + ); + + tensor_D_computed.sync_host(); + + passed = cutlass::reference::host::TensorEquals( + tensor_D_computed.host_view(), + tensor_D_reference.host_view()); + + EXPECT_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_Conv3d_ImplicitGemm_device_" + << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_") + << (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? 
"fprop_" : + (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) + << "ndhwc_" + << problem_size.N << "x" + << problem_size.D << "x" + << problem_size.H << "x" + << problem_size.W << "x" + << problem_size.C + << "_ktrsc_" + << problem_size.K << "x" + << problem_size.T << "x" + << problem_size.R << "x" + << problem_size.S << "x" + << problem_size.C + << "_padding_" + << problem_size.pad_d << "x" + << problem_size.pad_h << "x" + << problem_size.pad_w + << "_stride_" + << problem_size.stride_d << "x" + << problem_size.stride_h << "x" + << problem_size.stride_w + << "_dilation_" + << problem_size.dilation_d << "x" + << problem_size.dilation_h << "x" + << problem_size.dilation_w << "_" + << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_") + << Conv3d::ThreadblockShape::kM << "x" + << Conv3d::ThreadblockShape::kN << "x" + << Conv3d::ThreadblockShape::kK << "_" + << Conv3d::WarpShape::kM << "x" + << Conv3d::WarpShape::kN << "x" + << Conv3d::WarpShape::kK << ".txt"; + + std::cout << fname.str() << std::endl; + + std::ofstream results(fname.str()); + + results << problem_size << std::endl; + + results + << "\nA:\n" << tensor_A.host_view() << "\n" + << "\nB:\n" << tensor_B.host_view() << "\n" + << "\nC:\n" << tensor_C.host_view() << "\n" + << "\nD reference:\n" << tensor_D_reference.host_view() << "\n" + << "\nD computed:\n" << tensor_D_computed.host_view() << "\n"; + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference +// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes +// Additionaly, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// (conv_blacklist_sizes) +///////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +bool TestAllConv3d( + const Conv3dProblemVector & conv_test_sizes = Conv3dProblemVector(), + const Conv3dProblemVector & conv_blacklist_sizes = Conv3dProblemVector()) { + + bool passed = true; + + // + // Testbed object + // + + //TestbedConv3d testbed(cutlass::Distribution::Sequential, cutlass::Distribution::Sequential, cutlass::Distribution::Sequential); + TestbedConv3d testbed; + + // + // Get conv problem sizes to run conv operator + // + TestbedConv3dProblemSizes conv3d_problems(128/cutlass::sizeof_bits::value); + + // + // Get conv problem sizes to run conv operator + // + //TestbedConv3dProblemSizes conv_problems(128/cutlass::sizeof_bits::value); + + // Vector of conv3d problem sizes to avoid duplicate runs + Conv3dProblemVector conv_tested_sizes; + + Conv3dProblemVector const *problem_vectors[] = { + &conv3d_problems.conv3d_default_sizes, + &conv3d_problems.conv3d_vnet_medical_sizes, + &conv_test_sizes + }; + + // Sweep conv3d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0) + for (Conv3dProblemVector const * problem_vector : problem_vectors) { + + // Run conv testbed on default convolution sizes + for(auto conv_problem : *problem_vector) { + + // Skip blacklist and avoid duplicate problem sizes + if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() || + std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) 
{ + continue; + } + + // + // Procedurally disable certain cases + // + + // CUTLASS DGRAD's unity stride specialization only supports stride {1, 1} + if ((ImplicitGemm::kConvolutionalOperator == + cutlass::conv::Operator::kDgrad) && + (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == + cutlass::conv::StrideSupport::kUnity)) { + if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) { + continue; + } + } + + // + // Test + // + // push back tested problem size to avoid re-running duplicates + conv_tested_sizes.push_back(conv_problem); + + // test mode = xcross + passed = testbed.run( + conv_problem, + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + + // test mode = convolution + passed = testbed.run( + conv_problem.reset_mode(cutlass::conv::Mode::kConvolution), + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + } + } + + // Sweep split-k-slices using serial and parallel reduction with non-unity alpha and non-zero beta for + // a single conv3d problem size. Convolution unit tests take a long time to run so only sweep parameters + // which are absolutely necessary to catch functional bugs. The code below provides the option to sweep + // alpha and beta for local testing, but only runs one value for alpha and beta. + cutlass::conv::Conv3dProblemSize conv3d_split_k_test_size ( + {1, 8, 8, 8, 32}, // input size (NDHWC) + {32, 3, 3, 3, 32}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + ); + + cutlass::conv::SplitKMode split_k_modes [] = { + cutlass::conv::SplitKMode::kSerial, + cutlass::conv::SplitKMode::kParallel + }; + + int split_k_slices[] = { + 1, 2, 3, 4, 201 + }; + + double problem_alpha[] = { + 2.0 + }; + + double problem_beta[] = { + 2.0 + }; + + for (auto split_k_mode : split_k_modes) { + for (auto split_k_slice : split_k_slices) { + for (auto alpha : problem_alpha) { + for (auto beta : problem_beta) { + + passed = testbed.run( + conv3d_split_k_test_size.reset_split_k_slices(split_k_slice), + split_k_mode, + cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha), + cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta)); + + if (!passed) { + return false; + } + } + } + } + } + + return passed; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu new file mode 100644 index 0000000000..a3f8409447 --- /dev/null +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer.
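TestAllConv3d also accepts caller-supplied problem sizes and a blacklist, which none of the wgrad unit tests that follow exercise. A hedged sketch of that usage, intended for a test body that already includes conv3d_testbed.h; Conv3dFprop stands in for any cutlass::conv::device::ImplicitGemmConvolution instantiation (for example one built from DefaultConv3dFprop), and the chosen extents are illustrative only:

// Extra problem size to cover beyond the built-in default and V-Net lists.
test::conv::device::Conv3dProblemVector extra_sizes;
extra_sizes.push_back(cutlass::conv::Conv3dProblemSize(
  {1, 4, 56, 56, 64},             // input size (NDHWC)
  {64, 3, 3, 3, 64},              // filter size (KTRSC)
  cutlass::Coord<3>({1, 1, 1}),   // padding
  cutlass::Coord<3>({1, 1, 1}),   // stride
  cutlass::Coord<3>({1, 1, 1})    // dilation
));

// Sizes known to be unsupported by this particular kernel could be listed here and skipped.
test::conv::device::Conv3dProblemVector blacklist;

// Conv3dFprop is a placeholder device-level operator alias, not defined in this patch.
EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>(extra_sizes, blacklist));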
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv3d_Wgrad_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED + diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..9847aede81 --- /dev/null +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,159 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +TEST(SM80_Device_Conv3d_Wgrad_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// + + +TEST(SM80_Device_Conv3d_Wgrad_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = 
float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv3dWgradKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dWgrad>()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Wgrad_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 64x256_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv3dWgradKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dWgrad>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..6dcbf0e726 --- /dev/null +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution.
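The tile shapes in the instantiations above fix the threadblock layout: a 128x128x32 threadblock tile over 64x64x32 warp tiles gives (128/64) x (128/64) = 4 warps, that is 128 threads per CTA, and the 64x256x32 variant likewise uses 1 x 4 = 4 warps. A small compile-time sketch of that arithmetic (WarpCount here is an illustrative helper, not the CUTLASS-internal definition):

#include "cutlass/gemm/gemm.h"

// Number of warps implied by a threadblock tile / warp tile pair (illustrative helper).
template <typename ThreadblockShape, typename WarpShape>
struct WarpCount {
  static int const kM = ThreadblockShape::kM / WarpShape::kM;
  static int const kN = ThreadblockShape::kN / WarpShape::kN;
  static int const kCount = kM * kN;
};

static_assert(WarpCount<cutlass::gemm::GemmShape<128, 128, 32>,
                        cutlass::gemm::GemmShape<64, 64, 32>>::kCount == 4,
              "128x128 threadblock tile over 64x64 warp tiles uses 4 warps (128 threads).");
static_assert(WarpCount<cutlass::gemm::GemmShape<64, 256, 32>,
                        cutlass::gemm::GemmShape<64, 64, 32>>::kCount == 4,
              "64x256 threadblock tile over 64x64 warp tiles also uses 4 warps.");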
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Wgrad_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv3d_Wgrad_Optimized_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu index fcc8426ca3..f3552a1847 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu @@ -804,7 +804,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_32x64_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, @@ -874,7 +874,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_32x128_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, @@ -944,7 +944,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_64x128_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, @@ -1014,7 +1014,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_128x128_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, @@ -1084,7 +1084,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_128x64_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 84247e0bdc..7ead7eba54 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -20,85 +20,109 @@ # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-cutlass_test_unit_add_executable( +add_custom_target( cutlass_test_unit_gemm_device + DEPENDS + cutlass_test_unit_gemm_device_simt + cutlass_test_unit_gemm_device_tensorop_sm70 + cutlass_test_unit_gemm_device_tensorop_sm75 + cutlass_test_unit_gemm_device_tensorop_f16_sm80 + cutlass_test_unit_gemm_device_tensorop_f32_sm80 + cutlass_test_unit_gemm_device_tensorop_f32_tf32_sm80 + cutlass_test_unit_gemm_device_tensorop_f64 + cutlass_test_unit_gemm_device_tensorop_s32_sm80 + cutlass_test_unit_gemm_device_wmma + cutlass_test_unit_gemm_device_tensorop_planar_complex + cutlass_test_unit_gemm_device_sparse_tensorop_sm80 +) + +add_custom_target( + test_unit_gemm_device + DEPENDS + test_unit_gemm_device_simt + test_unit_gemm_device_tensorop_sm70 + test_unit_gemm_device_tensorop_sm75 + test_unit_gemm_device_tensorop_f16_sm80 + test_unit_gemm_device_tensorop_f32_sm80 + test_unit_gemm_device_tensorop_f32_tf32_sm80 + test_unit_gemm_device_tensorop_f64 + test_unit_gemm_device_tensorop_s32_sm80 + test_unit_gemm_device_wmma + test_unit_gemm_device_tensorop_planar_complex + test_unit_gemm_device_sparse_tensorop_sm80 +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_simt BATCH_SOURCES ON BATCH_SIZE 4 - gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu - gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu - gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu + simt_sgemm_nt_sm80.cu + simt_sgemm_tn_sm80.cu + + simt_cgemm_nn_sm50.cu + simt_cgemm_nt_sm50.cu + simt_cgemm_tn_sm50.cu + simt_cgemm_tt_sm50.cu - gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu - gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu - gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu - gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu + simt_dgemm_nn_sm50.cu + simt_dgemm_nt_sm50.cu + simt_dgemm_tn_sm50.cu + simt_dgemm_tt_sm50.cu - gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu - gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu + simt_hgemm_nn_sm50.cu + simt_hgemm_nt_sm50.cu + simt_hgemm_tn_sm50.cu + simt_hgemm_tt_sm50.cu - gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu - gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + simt_igemm_nn_sm50.cu + simt_igemm_nt_sm50.cu + simt_igemm_tn_sm50.cu + simt_igemm_tt_sm50.cu - gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu - gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu + simt_int8_igemm_sm61_sliced_k.cu + simt_int8_igemm_sm61.cu - gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu - gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu - gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu - gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu - gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu - gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu - gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu - gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu - gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu - gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu - gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu - gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu - gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu - gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu - gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu + simt_sgemm_nn_sm50.cu + simt_sgemm_nt_sm50.cu + simt_sgemm_tn_sm50.cu + simt_sgemm_tt_sm50.cu - gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu - gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu - gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu - gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu - 
gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu - gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu - gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu + simt_zgemm_nn_sm50.cu + simt_zgemm_nt_sm50.cu + simt_zgemm_tn_sm50.cu + simt_zgemm_tt_sm50.cu - gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu - gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu + gemm_splitk_simt_sm50.cu +) - simt_sgemm_nt_sm80.cu - simt_sgemm_tn_sm80.cu +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_sm70 - gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu - gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu - gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu - gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu - gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu - gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu - gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu - gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu + BATCH_SOURCES ON + BATCH_SIZE 4 - gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu - gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu - gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu - gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu + gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu + gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu + gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu + gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu - gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu - gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu + gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu + + gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu + gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu + + gemm_splitk_tensor_op_sm70.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_sm75 + + BATCH_SOURCES ON + BATCH_SIZE 4 - gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -123,54 +147,105 @@ cutlass_test_unit_add_executable( gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu + + gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu + gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu - gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu - gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu - gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu - gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu + gemm_splitk_serial_tensor_op_sm75.cu + gemm_splitk_tensor_op_sm75.cu - gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu +) - gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu - gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_f16_sm80 - simt_cgemm_nn_sm50.cu - simt_cgemm_nt_sm50.cu - simt_cgemm_tn_sm50.cu - simt_cgemm_tt_sm50.cu + BATCH_SOURCES ON + BATCH_SIZE 4 - simt_dgemm_nn_sm50.cu - simt_dgemm_nt_sm50.cu - simt_dgemm_tn_sm50.cu - simt_dgemm_tt_sm50.cu + gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu +) - simt_hgemm_nn_sm50.cu - simt_hgemm_nt_sm50.cu - simt_hgemm_tn_sm50.cu - simt_hgemm_tt_sm50.cu +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_f32_sm80 - simt_igemm_nn_sm50.cu - simt_igemm_nt_sm50.cu - simt_igemm_tn_sm50.cu - simt_igemm_tt_sm50.cu + BATCH_SOURCES ON + BATCH_SIZE 4 - simt_int8_igemm_sm61_sliced_k.cu - simt_int8_igemm_sm61.cu + gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu + gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu + 
gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu + gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu + gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu + gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu + gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu + gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu + gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu + gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu + gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu +) - simt_sgemm_nn_sm50.cu - simt_sgemm_nt_sm50.cu - simt_sgemm_tn_sm50.cu - simt_sgemm_tt_sm50.cu +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_f32_tf32_sm80 - simt_zgemm_nn_sm50.cu - simt_zgemm_nt_sm50.cu - simt_zgemm_tn_sm50.cu - simt_zgemm_tt_sm50.cu + BATCH_SOURCES ON + BATCH_SIZE 4 - gemm_splitk_serial_tensor_op_sm75.cu - gemm_splitk_tensor_op_sm75.cu - gemm_splitk_tensor_op_sm70.cu - gemm_splitk_simt_sm50.cu + gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu + gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu + gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu + gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu + gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu + gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu + gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu + + gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu + gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_f64 + + BATCH_SOURCES ON + BATCH_SIZE 4 + + gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu + gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu + + gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu + gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu + gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu + gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu + gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu + gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_s32_sm80 + + BATCH_SOURCES ON + BATCH_SIZE 4 + + gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu + gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu + gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu + gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu + gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu + + gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu + gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_wmma + + BATCH_SOURCES ON + BATCH_SIZE 4 # wmma floating point tests gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -222,5 +297,37 @@ cutlass_test_unit_add_executable( gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_planar_complex + + BATCH_SOURCES ON + BATCH_SIZE 4 + + gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu + gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu + gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_sparse_tensorop_sm80 + + BATCH_SOURCES ON + BATCH_SIZE 4 + gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu + gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu + gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu + gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu + 
gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu + gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu + gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu ) + diff --git a/test/unit/gemm/device/multistage_testbed.h b/test/unit/gemm/device/multistage_testbed.h index bdc4b77081..f7b6ac8f56 100644 --- a/test/unit/gemm/device/multistage_testbed.h +++ b/test/unit/gemm/device/multistage_testbed.h @@ -97,10 +97,45 @@ struct MultistageTestbed { return true; } + /// Waives test if CUDA device is insufficient + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run(cutlass::gemm::GemmCoord problem_size, ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + + // Waives test if CUDA device is insufficient + if (!sufficient()) { + return true; + } + // // Allocate the GEMM workspace // @@ -144,7 +179,11 @@ struct MultistageTestbed { cutlass::Status status = gemm_op.initialize(arguments); - EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + cudaError_t error = cudaGetLastError(); + std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n"; + return true; + } // // Run the GEMM diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu index 7d2ab45b6f..f0fe1ebd94 100644 --- a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu +++ b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu @@ -39,7 +39,8 @@ #include "cutlass/util/tensor_view_io.h" #include "testbed.h" - + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) //////////////////////////////////////////////////////////////////////////////// TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 32x64x8_32x64x1) { @@ -246,4 +247,8 @@ TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x256x8_64x64x1) { EXPECT_TRUE(test::gemm::device::TestAllGemm()); } -//////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu index 00461d2e0f..c183fbff34 100644 --- a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu +++ b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu @@ -41,8 +41,10 @@ #include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) //////////////////////////////////////////////////////////////////////////////// - + TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 32x64x8_32x64x1) { using Element = float; @@ -246,4 +248,8 @@ 
TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x256x8_64x64x1) { EXPECT_TRUE(test::gemm::device::TestAllGemm()); } -//////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h index b8c739a7e9..c2bf40ec21 100644 --- a/test/unit/gemm/device/testbed.h +++ b/test/unit/gemm/device/testbed.h @@ -247,6 +247,36 @@ struct Testbed { return compare_reference(problem_size, alpha, beta); } + /// Determine if the CUDA device is sufficient to run the kernel + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + + /// Executes one test bool run( cutlass::gemm::GemmCoord problem_size, @@ -254,6 +284,10 @@ struct Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + return true; + } this->initialize(problem_size); @@ -279,7 +313,11 @@ struct Testbed { cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); - EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + if (status != cutlass::Status::kSuccess) { + cudaError_t error = cudaGetLastError(); + std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n"; + return true; + } // // Run the GEMM diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h index 65c0fdfb4c..a3e1353ee1 100644 --- a/test/unit/gemm/device/testbed_complex.h +++ b/test/unit/gemm/device/testbed_complex.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../common/cutlass_unit_test.h" @@ -100,6 +101,34 @@ struct TestbedComplex : public Testbed { return this->compare_reference(problem_size, alpha, beta); } + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run( cutlass::gemm::GemmCoord problem_size, @@ -107,7 +136,17 @@ struct TestbedComplex : public Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive the test if device not sufficient + if (!sufficient()) { + return true; + } + + 
// + // Initialize workspace + // + this->initialize(problem_size); + // // Initialize the GEMM operator diff --git a/test/unit/gemm/device/testbed_interleaved.h b/test/unit/gemm/device/testbed_interleaved.h index 3cbd720bd4..6e14f87f6e 100644 --- a/test/unit/gemm/device/testbed_interleaved.h +++ b/test/unit/gemm/device/testbed_interleaved.h @@ -99,6 +99,35 @@ struct InterleavedTestbed { return false; } + return true; + } + + /// Waives test if CUDA device is insufficient + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + return true; } @@ -107,6 +136,10 @@ struct InterleavedTestbed { cutlass::gemm::GemmCoord problem_size, ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + + if (!sufficient()) { + return true; + } // // Allocate the GEMM workspace diff --git a/test/unit/gemm/device/testbed_sparse.h b/test/unit/gemm/device/testbed_sparse.h index d1d57b893c..28901a9867 100644 --- a/test/unit/gemm/device/testbed_sparse.h +++ b/test/unit/gemm/device/testbed_sparse.h @@ -295,6 +295,34 @@ struct SparseTestbed { return compare_reference(problem_size, alpha, beta); } + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run( cutlass::gemm::GemmCoord problem_size, @@ -302,6 +330,11 @@ struct SparseTestbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + return true; + } + this->initialize(problem_size); // @@ -327,7 +360,10 @@ struct SparseTestbed { cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); - EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + // This failure is likely due to insufficient device capabilities. Waive the test. 
+ if (status != cutlass::Status::kSuccess) { + return true; + } // // Run the GEMM diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h index a83c27cda6..fb36f10e25 100644 --- a/test/unit/gemm/device/testbed_universal.h +++ b/test/unit/gemm/device/testbed_universal.h @@ -250,6 +250,34 @@ struct TestbedUniversal { return compare_reference(problem_size, alpha, beta); } + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run( cutlass::gemm::GemmUniversalMode mode, @@ -258,6 +286,11 @@ struct TestbedUniversal { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + return true; + } + this->initialize(problem_size); // diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h index 7036e26d97..d667d8f550 100644 --- a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h +++ b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h @@ -328,19 +328,17 @@ struct SparseTestbed { test::gemm::threadblock::kernel_multistage_mma_sparse, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - EXPECT_EQ(result, cudaSuccess) - << " cudaFuncSetAttribute " - "cudaFuncAttributeMaxDynamicSharedMemorySize error: " - << cudaGetErrorString(result); + if (result != cudaSuccess) { + return true; + } result = cudaFuncSetAttribute( test::gemm::threadblock::kernel_multistage_mma_sparse, cudaFuncAttributePreferredSharedMemoryCarveout, 100); - EXPECT_EQ(result, cudaSuccess) - << " cudaFuncSetAttribute " - "cudaFuncAttributePreferredSharedMemoryCarveout error: " - << cudaGetErrorString(result); + if (result != cudaSuccess) { + return true; + } } test::gemm::threadblock::kernel_multistage_mma_sparse diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed.h b/test/unit/gemm/threadblock/mma_multistage_testbed.h index 3870dd22fb..6b8dc94fb6 100644 --- a/test/unit/gemm/threadblock/mma_multistage_testbed.h +++ b/test/unit/gemm/threadblock/mma_multistage_testbed.h @@ -266,19 +266,17 @@ struct Testbed { test::gemm::threadblock::kernel_multistage_mma, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - EXPECT_EQ(result, cudaSuccess) - << " cudaFuncSetAttribute " - "cudaFuncAttributeMaxDynamicSharedMemorySize error: " - << cudaGetErrorString(result); + if (result != cudaSuccess) { + return true; + } result = cudaFuncSetAttribute( test::gemm::threadblock::kernel_multistage_mma, cudaFuncAttributePreferredSharedMemoryCarveout, 100); - EXPECT_EQ(result, cudaSuccess) - << " cudaFuncSetAttribute " - "cudaFuncAttributePreferredSharedMemoryCarveout error: " - << cudaGetErrorString(result); + if (result != cudaSuccess) { + return true; + } } test::gemm::threadblock::kernel_multistage_mma diff --git a/test/unit/gemm/warp/gemm_sm70.cu b/test/unit/gemm/warp/gemm_sm70.cu index 
16f1427e55..3785290e5c 100644 --- a/test/unit/gemm/warp/gemm_sm70.cu +++ b/test/unit/gemm/warp/gemm_sm70.cu @@ -199,6 +199,91 @@ TEST(SM70_warp_gemm_tensor_op_crosswise, 64x64x32_64x64x32_16x16x4) { test::gemm::warp::Testbed >().run(); } + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM70_warp_gemm_volta_tensor_op_canonical_f32_row_col, 64x64x16_64x64x4_8x8x4) { + + using Shape = cutlass::gemm::GemmShape<64, 64, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + cutlass::gemm::GemmShape<16, 16, 4>, + 32, + ElementA, + cutlass::layout::RowMajor, + ElementB, + cutlass::layout::ColumnMajor, + ElementC, + cutlass::layout::RowMajor, + cutlass::arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> + >; + + using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp< + Shape, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed >() + .run(); +} + +TEST(SM70_warp_gemm_volta_tensor_op_canonical_f32_col_row, 64x64x16_64x64x4_8x8x4) { + + using Shape = cutlass::gemm::GemmShape<64, 64, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + cutlass::gemm::GemmShape<16, 16, 4>, + 32, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> + >; + + using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp< + Shape, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed >() + .run(); +} + ///////////////////////////////////////////////////////////////////////////////////////////////// #endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED diff --git a/test/unit/gemm/warp/testbed.h b/test/unit/gemm/warp/testbed.h index c0c98d80df..3cc00fb447 100644 --- a/test/unit/gemm/warp/testbed.h +++ b/test/unit/gemm/warp/testbed.h @@ -30,6 +30,7 @@ #include "cutlass/cutlass.h" #include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_types.h" #include "cutlass/subbyte_reference.h" #include "cutlass/platform/platform.h" @@ -1019,9 +1020,11 @@ __global__ void sparse_kernel( typename Mma::ElementB, ThreadblockShape::kN * ThreadblockShape::kK> smem_buffer_B; __shared__ cutlass::AlignedBuffer< - typename Mma::ElementE, ThreadblockShape::kM * ThreadblockShape::kK / + typename Mma::ElementE, Mma::Shape::kM * Mma::Shape::kK / Mma::kSparse / Mma::kElementsPerElementE> smem_buffer_E; + + __syncthreads(); if (threadIdx.x == 0) { typename Mma::ElementA *smem_ptr_A = smem_buffer_A.data(); @@ -1168,6 +1171,7 @@ struct SparseTestbed { /// Allocates workspace in device memory SparseTestbed() { + tensor_A.reset(cutlass::make_Coord(ThreadblockShape::kM, ThreadblockShape::kK / Sparse)); tensor_A_uncompressed.reset( diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt index 7b4f267069..96c3716141 100644 --- a/test/unit/reduction/CMakeLists.txt +++ 
b/test/unit/reduction/CMakeLists.txt @@ -22,7 +22,6 @@ add_subdirectory(thread) add_subdirectory(kernel) - add_custom_target( cutlass_test_unit_reduction DEPENDS diff --git a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu index e52af8edf9..8d2382e4cf 100644 --- a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu +++ b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu @@ -81,7 +81,7 @@ __global__ void kernel_gemm_threadblock_tensor_op_multiplicand_store( } } - // Use iterator to scatter results + // Use iterator to store results Iterator iter(ref_output, threadIdx.x); iter.store(frag); } diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 3ca637b2db..e43c821e64 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -26,6 +26,12 @@ if (CUTLASS_ENABLE_LIBRARY) add_subdirectory(library) endif() if (CUTLASS_ENABLE_PROFILER) - add_subdirectory(profiler) + if (NOT CUTLASS_ENABLE_LIBRARY) + message(SEND_ERROR "Build conflict: The CUTLASS profiler requires the CUTLASS library.") + message(SEND_ERROR " CUTLASS_ENABLE_PROFILER = ${CUTLASS_ENABLE_PROFILER}") + message(SEND_ERROR " CUTLASS_ENABLE_LIBRARY = ${CUTLASS_ENABLE_LIBRARY}") + else() + add_subdirectory(profiler) + endif() endif() diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt index 294cd98f01..4bf7577fb8 100644 --- a/tools/library/CMakeLists.txt +++ b/tools/library/CMakeLists.txt @@ -63,6 +63,15 @@ cutlass_add_library( src/reference/gemm.cu src/reference/initialize_reference_operations.cu + + # cutlass reduction instances in cutlass library + src/reduction/reduction_device.cu + src/reduction/init_reduction_operations.cu + + # cutlass conv reference instances in cutlass library + src/reference/conv2d.cu + src/reference/conv3d.cu + ) file(GLOB_RECURSE GENERATOR_PYTHON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/scripts/*.py) @@ -136,7 +145,7 @@ function(cutlass_add_cutlass_library) cutlass_library_includes ) - set_target_properties(${__NAME} PROPERTIES DEBUG_POSTFIX ${CUTLASS_LIBRARY_DEBUG_POSTFIX}) + set_target_properties(${__NAME} PROPERTIES DEBUG_POSTFIX "${CUTLASS_LIBRARY_DEBUG_POSTFIX}") set(OUTPUT_NAME cutlass) diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h index 58c6b30c7c..27d2bfe6a4 100644 --- a/tools/library/include/cutlass/library/handle.h +++ b/tools/library/include/cutlass/library/handle.h @@ -335,6 +335,10 @@ class Handle { using HandlePtr = std::unique_ptr; ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Finds conv2d operation instances with Conv2d::ElementC = Reduction::ElementWorkspace +Operation const* find_conv_operation_for_parallel_reduction(Operation const *operation); +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace library } // namespace cutlass diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h index f692437199..6a018a704c 100644 --- a/tools/library/include/cutlass/library/library.h +++ b/tools/library/include/cutlass/library/library.h @@ -53,6 +53,10 @@ #include "cutlass/layout/tensor.h" #include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + 
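The new handle.h entry point above, find_conv_operation_for_parallel_reduction(), pairs a convolution with a reduction instance for parallel split-K: the convolution writes one partial accumulation per partition into a workspace whose element type matches Reduction::ElementWorkspace, and a reduction kernel then sums the partitions and applies the linear-combination epilogue. A hedged host-side sketch of that reduction; the parameter names mirror the ReductionConfiguration / ReductionArguments structures added to library.h further below, while the float element type and the row-major indexing are illustrative assumptions rather than the library's device kernel:

#include <cstdint>

// D = alpha * sum_p(workspace_p) + beta * S, reduced over `partitions`
// partial results spaced `partition_stride` elements apart.
void reduce_split_k_partitions(
    int rows, int cols, int partitions,
    int64_t partition_stride,
    float const *workspace, int64_t ldw,   // workspace operand, leading dimension ldw
    float const *source,    int64_t lds,   // source operand, leading dimension lds
    float *destination,     int64_t ldd,   // destination operand, leading dimension ldd
    float alpha, float beta) {

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      float sum = 0.0f;
      for (int p = 0; p < partitions; ++p) {
        sum += workspace[p * partition_stride + int64_t(i) * ldw + j];
      }
      destination[int64_t(i) * ldd + j] = alpha * sum + beta * source[int64_t(i) * lds + j];
    }
  }
}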
///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -79,6 +83,10 @@ enum class LayoutTypeID { kTensorNCDHW, kTensorNHWC, kTensorNDHWC, + kTensorNC32HW32, + kTensorC32RSK32, + kTensorNC64HW64, + kTensorC64RSK64, kInvalid }; @@ -138,6 +146,7 @@ enum class Provider { kReferenceHost, kReferenceDevice, kCUBLAS, + kCUDNN, kInvalid }; @@ -146,6 +155,8 @@ enum class Provider { /// Enumeration indicating the kind of operation enum class OperationKind { kGemm, + kConv2d, + kConv3d, kEqGemm, kSparseGemm, kReduction, @@ -204,6 +215,30 @@ enum class GemmKind { /// Mode of Universal GEMM using GemmUniversalMode = cutlass::gemm::GemmUniversalMode; +/// Enumeration indicating what kind of Conv2d operation to perform +enum class ConvKind { + kUnknown, + kFprop, + kDgrad, + kWgrad, + kInvalid +}; + +enum class ConvModeID { + kCrossCorrelation, + kConvolution, + kInvalid +}; + +// Iterator algorithm enum in order of general performance-efficiency +enum class IteratorAlgorithmID { + kNone, + kAnalytic, + kOptimized, + kInvalid +}; + + enum class EpilogueKind { kUnknown, kConversion, @@ -477,6 +512,66 @@ struct ReductionDescription : public OperationDescription { }; ///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Description of all Conv2d operations +struct ConvDescription : public OperationDescription { + /// Describes the convolution dimension support (2D or 3D) + int conv_dim; + + /// Describes the kind of convolution + ConvKind conv_kind; + + /// Describes the type of iterator algorithm (analytic or precomputed) + IteratorAlgorithmID iterator_algorithm; + + /// Describes the A operand + TensorDescription A; + + /// Describes the B operand + TensorDescription B; + + /// Describes the C operand + TensorDescription C; + + /// Describes the data type of the scalars passed to the epilogue + NumericTypeID element_epilogue; + + // + // Methods + // + // Returns Activation TensorDescription + TensorDescription activation() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return A; + case library::ConvKind::kDgrad : return C; + case library::ConvKind::kWgrad : return B; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Filter TensorDescription + TensorDescription filter() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return B; + case library::ConvKind::kDgrad : return B; + case library::ConvKind::kWgrad : return C; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Output TensorDescription + TensorDescription output() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return C; + case library::ConvKind::kDgrad : return A; + case library::ConvKind::kWgrad : return A; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + +}; + + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Base class for all operations @@ -825,6 +920,204 @@ struct SparseGemmArguments { ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Two dimensional convolution +// +// OperationKind: Conv2d +// +struct Conv2dConfiguration { + + conv::SplitKMode split_k_mode; + + /// Conv2d problem size + // contains strictly conv2d size (N,H,W,C,K,R,S,P,Q,padding,stride,dilation,mode) + // also includes (split_k_slices, groups) + 
conv::Conv2dProblemSize problem_size; + + /// Layout object for activations tensor + layout::TensorNHWC layout_activations; + + /// Layout object for filters tensor + layout::TensorNHWC layout_filters; + + /// Layout object for source tensor + layout::TensorNHWC layout_source; + + /// Layout object for output tensor + layout::TensorNHWC layout_output; + + // + // Methods + // + + // Mapping functions (A,B,C -> activation,filter,output) + layout::TensorNHWC layout_a(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_activations; + case library::ConvKind::kDgrad: return layout_output; + case library::ConvKind::kWgrad: return layout_output; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + layout::TensorNHWC layout_b(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_filters; + case library::ConvKind::kDgrad: return layout_filters; + case library::ConvKind::kWgrad: return layout_activations; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + layout::TensorNHWC layout_c(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_output; + case library::ConvKind::kDgrad: return layout_activations; + case library::ConvKind::kWgrad: return layout_filters; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } +}; + + +/// Three dimensional convolution +// +// OperationKind: Conv3d +// +struct Conv3dConfiguration { + + conv::SplitKMode split_k_mode; + + /// Conv3d problem size + // contains strictly conv3d size (N,D,H,W,C,K,T,R,S,Z,P,Q,padding,stride,dilation,mode) + // also includes (split_k_slices, groups) + conv::Conv3dProblemSize problem_size; + + /// Layout object for activations tensor + layout::TensorNDHWC layout_activations; + + /// Layout object for filters tensor + layout::TensorNDHWC layout_filters; + + /// Layout object for source tensor + layout::TensorNDHWC layout_source; + + /// Layout object for output tensor + layout::TensorNDHWC layout_output; + + // + // Methods + // + + // Mapping functions (A,B,C -> activation,filter,output) + layout::TensorNDHWC layout_a(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_activations; + case library::ConvKind::kDgrad: return layout_output; + case library::ConvKind::kWgrad: return layout_output; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + layout::TensorNDHWC layout_b(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_filters; + case library::ConvKind::kDgrad: return layout_filters; + case library::ConvKind::kWgrad: return layout_activations; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + layout::TensorNDHWC layout_c(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_output; + case library::ConvKind::kDgrad: return layout_activations; + case library::ConvKind::kWgrad: return layout_filters; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } +}; + +/// Arguments for CONV +struct ConvArguments { + + ///////////////////////////////////////////////////////// + /// ImplicitGemm matrices A, B, C, D + 
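The layout_a/b/c mapping functions above, like the activation()/filter()/output() accessors on ConvDescription, encode how the convolution tensors take on the A, B, C roles of the implicit GEMM for each operator. A small sketch of the resulting GEMM extents; Extent and ConvOperator are stand-ins (not the library's Conv2dProblemSize or ConvKind), and the formulas are the standard implicit GEMM mapping rather than code taken from this change:

// Fprop: A = activations (NPQ x RSC), B = filter (RSC x K),        C = output (NPQ x K)
// Dgrad: A = output      (NHW x RSK), B = filter (RSK x C),        C = activations (NHW x C)
// Wgrad: A = output      (K x NPQ),   B = activations (NPQ x RSC), C = filter (K x RSC)
struct Extent { int N, H, W, C, K, R, S, P, Q; };

enum class ConvOperator { kFprop, kDgrad, kWgrad };

// Implicit GEMM problem extents (M, N, K) of a 2-D convolution.
void implicit_gemm_extent(ConvOperator op, Extent const &e, int &M, int &N, int &K) {
  switch (op) {
    case ConvOperator::kFprop: M = e.N * e.P * e.Q; N = e.K;             K = e.R * e.S * e.C; break;
    case ConvOperator::kDgrad: M = e.N * e.H * e.W; N = e.C;             K = e.R * e.S * e.K; break;
    case ConvOperator::kWgrad: M = e.K;             N = e.R * e.S * e.C; K = e.N * e.P * e.Q; break;
  }
}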
///////////////////////////////////////////////////////// + /// pointer to implicit gemm matrix A + void const *A; + + /// pointer to implicit gemm matrix B + void const *B; + + /// pointer to implicit gemm matrix C + void const *C; + + /// pointer to implicit gemm destination matrix D + void *D; + + /// Host or device pointer to alpha scalar + void const *alpha; + + /// Host or device pointer to beta scalar + void const *beta; + + /// Enumerant indicating whether alpha/beta point to host or device memory + ScalarPointerMode pointer_mode; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Configuration for Reduction operations +// +// OperationKind: Reduction +// +struct ReductionConfiguration { + + /// Reduction problem size + MatrixCoord problem_size; + + /// Number of partitions to reduce + int partitions; + + /// Number of elements between each partition + int64_t partition_stride; + + /// leading dimension of 'w'orkspace operand + int64_t ldw; + + /// leading dimension of 's'ource operand + int64_t lds; + + /// leading dimension of 'd'estination operand + int64_t ldd; +}; + +/// Arguments for Reduction +struct ReductionArguments { + + /// Pointer to workspace matrix + void const *workspace; + + /// Pointer to source matrix + void const *source; + + /// Pointer to destination matrix + void *destination; + + /// pointer to reference matrix + void *reference; + + /// Host or device pointer to alpha scalar + void const *alpha; + + /// Host or device pointer to beta scalar + void const *beta; + + /// Enumerant indicating whether alpha/beta point to host or device memory + ScalarPointerMode pointer_mode; +}; + } // namespace library } // namespace cutlass diff --git a/tools/library/include/cutlass/library/manifest.h b/tools/library/include/cutlass/library/manifest.h index 7adf0fbbce..2bde2884b4 100644 --- a/tools/library/include/cutlass/library/manifest.h +++ b/tools/library/include/cutlass/library/manifest.h @@ -51,6 +51,9 @@ class Manifest; // init and insert all cutlass gemm operations in manifest object (procedurally generated using generator.py) void initialize_all(Manifest &manifest); +// init and insert all reduction op in manifest object (manually instantiated in library/reduction) +void initialize_all_reduction_op(Manifest &manifest); + ///////////////////////////////////////////////////////////////////////////////////////////////////////// /// List of operations diff --git a/tools/library/include/cutlass/library/operation_table.h b/tools/library/include/cutlass/library/operation_table.h index 3821f65acb..ba19ca123c 100644 --- a/tools/library/include/cutlass/library/operation_table.h +++ b/tools/library/include/cutlass/library/operation_table.h @@ -208,6 +208,262 @@ using GemmOperationFunctionalMap = std::unordered_map< >; ///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// +// Data Structures for Conv Functional Maps +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tuple uniquely identifying conv2d functional behavior +struct ConvFunctionalKey { + library::Provider provider; + library::ConvKind conv_kind; + library::NumericTypeID element_A; + library::LayoutTypeID layout_A; + library::NumericTypeID element_B; + library::LayoutTypeID layout_B; + library::NumericTypeID element_C; + library::LayoutTypeID layout_C; + 
library::NumericTypeID element_accumulator; + library::NumericTypeID element_compute; + + + // + // Methods + // + + inline + ConvFunctionalKey( + library::Provider provider = library::Provider::kInvalid, + library::ConvKind conv_kind = library::ConvKind::kFprop, + library::NumericTypeID element_A = library::NumericTypeID::kF16, + library::LayoutTypeID layout_A = library::LayoutTypeID::kTensorNHWC, + library::NumericTypeID element_B = library::NumericTypeID::kF16, + library::LayoutTypeID layout_B = library::LayoutTypeID::kTensorNHWC, + library::NumericTypeID element_C = library::NumericTypeID::kF16, + library::LayoutTypeID layout_C = library::LayoutTypeID::kTensorNHWC, + library::NumericTypeID element_accumulator = library::NumericTypeID::kF32, + library::NumericTypeID element_compute = library::NumericTypeID::kF32 + ): + provider(provider), + conv_kind(conv_kind), + element_A(element_A), + layout_A(layout_A), + element_B(element_B), + layout_B(layout_B), + element_C(element_C), + layout_C(layout_C), + element_accumulator(element_accumulator), + element_compute(element_compute) + { } + + inline + bool operator==(ConvFunctionalKey const &rhs) const { + return + (provider == rhs.provider) && + (conv_kind == rhs.conv_kind) && + (element_A == rhs.element_A) && + (layout_A == rhs.layout_A) && + (element_B == rhs.element_B) && + (layout_B == rhs.layout_B) && + (element_C == rhs.element_C) && + (layout_C == rhs.layout_C) && + (element_accumulator == rhs.element_accumulator) && + (element_compute == rhs.element_compute); + } + + inline + bool operator!=(ConvFunctionalKey const &rhs) const { + return !(*this == rhs); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// +inline +std::ostream& operator<< (std::ostream& out, const cutlass::library::ConvFunctionalKey& key) { + out << "{\n" + << "provider: " << to_string(key.provider) << std::endl + << "conv_kind: " << to_string(key.conv_kind) << std::endl + << "element_A: " << to_string(key.element_A) << std::endl + << "layout_A: " << to_string(key.layout_A) << std::endl + << "element_B: " << to_string(key.element_B) << std::endl + << "layout_B: " << to_string(key.layout_B) << std::endl + << "element_C: " << to_string(key.element_C) << std::endl + << "layout_C: " << to_string(key.layout_C) << std::endl + << "element_accumulator: " << to_string(key.element_accumulator) << std::endl + << "element_compute: " << to_string(key.element_compute) << std::endl + << "}"; + + return out; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +struct ConvFunctionalKeyHasher { + using IntHash = std::hash; + + inline + static size_t rotl(size_t key, int shl) { + return (key << shl) | (key >> (sizeof(key)*8 - shl)); + } + + inline + size_t operator()(ConvFunctionalKey const &key) const { + IntHash hash; + + return + rotl(hash(int(key.provider)), 1) ^ + rotl(hash(int(key.conv_kind)), 2) ^ + rotl(hash(int(key.element_A)), 3) ^ + rotl(hash(int(key.layout_A)), 4) ^ + rotl(hash(int(key.element_B)), 5) ^ + rotl(hash(int(key.layout_B)), 6) ^ + rotl(hash(int(key.element_C)), 7) ^ + rotl(hash(int(key.layout_C)), 8) ^ + rotl(hash(int(key.element_accumulator)), 9) ^ + rotl(hash(int(key.element_compute)), 10); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Establishes a partial ordering to search for Conv2d operators +struct ConvPreferenceKey { + + int compute_capability; + IteratorAlgorithmID 
iterator_algorithm; + + + // + // Methods + // + + ConvPreferenceKey(): compute_capability(), iterator_algorithm() { } + + ConvPreferenceKey(int cc, IteratorAlgorithmID iterator_algorithm): + compute_capability(cc), iterator_algorithm(iterator_algorithm) { } + + bool operator<(ConvPreferenceKey const &rhs) const { + return (compute_capability < rhs.compute_capability) || + ((compute_capability == rhs.compute_capability) && (iterator_algorithm < rhs.iterator_algorithm)); + } + + bool operator==(ConvPreferenceKey const &rhs) const { + return (compute_capability == rhs.compute_capability) && + (iterator_algorithm == rhs.iterator_algorithm); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Maps minimum compute capability onto a vector of possible operations +using ConvOperationVectorMap = std::map< + ConvPreferenceKey, + std::vector +>; + +/// Maps a GemmFunctionalKey onto a vector of Operation * objects expected to be of kind kGemm +using ConvOperationFunctionalMap = std::unordered_map< + ConvFunctionalKey, + ConvOperationVectorMap, + ConvFunctionalKeyHasher +>; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Tuple uniquely identifying conv2d functional behavior +struct ReductionFunctionalKey { + library::Provider provider; + library::NumericTypeID element_workspace; + library::NumericTypeID element_accumulator; + library::NumericTypeID element_output; + library::NumericTypeID element_compute; + library::MathOperationID reduce_math_op; + library::EpilogueKind epilogue_math_op; + + + // + // Methods + // + + inline + ReductionFunctionalKey( + library::Provider provider = library::Provider::kInvalid, + library::NumericTypeID element_workspace = library::NumericTypeID::kF16, + library::NumericTypeID element_accumulator = library::NumericTypeID::kF32, + library::NumericTypeID element_output = library::NumericTypeID::kF16, + library::NumericTypeID element_compute = library::NumericTypeID::kF32, + library::MathOperationID reduce_math_op = library::MathOperationID::kAdd, + library::EpilogueKind epilogue_math_op = library::EpilogueKind::kLinearCombination + ): + provider(provider), + element_workspace(element_workspace), + element_accumulator(element_accumulator), + element_output(element_output), + element_compute(element_compute), + reduce_math_op(reduce_math_op), + epilogue_math_op(epilogue_math_op) + { } + + inline + bool operator==(ReductionFunctionalKey const &rhs) const { + return + (provider == rhs.provider) && + (element_workspace == rhs.element_workspace) && + (element_accumulator == rhs.element_accumulator) && + (element_output == rhs.element_output) && + (element_compute == rhs.element_compute) && + (reduce_math_op == rhs.reduce_math_op) && + (epilogue_math_op == rhs.epilogue_math_op); + } + + inline + bool operator!=(ReductionFunctionalKey const &rhs) const { + return !(*this == rhs); + } +}; + + +struct ReductionFunctionalKeyHasher { + using IntHash = std::hash; + + inline + static size_t rotl(size_t key, int shl) { + return (key << shl) | (key >> (sizeof(key)*8 - shl)); + } + + inline + size_t operator()(ReductionFunctionalKey const &key) const { + IntHash hash; + + return + rotl(hash(int(key.provider)), 1) ^ + rotl(hash(int(key.element_workspace)), 2) ^ + rotl(hash(int(key.element_accumulator)), 3) ^ + rotl(hash(int(key.element_output)), 4) ^ + rotl(hash(int(key.element_compute)), 5) ^ + rotl(hash(int(key.reduce_math_op)), 6) ^ + 
rotl(hash(int(key.epilogue_math_op)), 7); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +inline +std::ostream& operator<< (std::ostream& out, const ReductionFunctionalKey& key) { + out << "{\n" + << "provider: " << library::to_string(key.provider) << std::endl + << "element_workspace : " << library::to_string(key.element_workspace) << std::endl + << "element_accumulator : " << library::to_string(key.element_accumulator) << std::endl + << "element_output : " << library::to_string(key.element_output) << std::endl + << "element_compute : " << library::to_string(key.element_compute) << std::endl + << "}"; + return out; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// ReductionOperationFunctionalMap has NO preference key and a single instance per functional key +// i.e. only one tile size configuration per functional key +using ReductionOperationFunctionalMap = std::unordered_map< + ReductionFunctionalKey, + library::Operation const *, + ReductionFunctionalKeyHasher +>; + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Table of cutlass::library::Operation instances @@ -218,6 +474,18 @@ class OperationTable { // provider (kCUTLASS) GemmOperationFunctionalMap gemm_operations; + /// Map of all operations of type kConv2d + // provider (kCUTLASS, kReferenceHost, kReferenceDevice) + ConvOperationFunctionalMap conv2d_operations; + + /// Map of all operations of type kConv3d + // provider (kCUTLASS, kReferenceHost, kReferenceDevice) + ConvOperationFunctionalMap conv3d_operations; + + /// Map of all operations of type kReduction + // provider (kCUTLASS) + ReductionOperationFunctionalMap reduction_operations; + public: void append(Manifest const &manifest); diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h index 526f836b2b..2e4a28c145 100644 --- a/tools/library/include/cutlass/library/util.h +++ b/tools/library/include/cutlass/library/util.h @@ -122,6 +122,27 @@ char const *to_string(SplitKMode split_k_mode, bool pretty = false); template <> SplitKMode from_string(std::string const &str); +/// Converts a ConvModeID enumerant to a string +char const *to_string(ConvModeID type, bool pretty = false); + +/// Converts a ConvModeID enumerant from a string +template <> +ConvModeID from_string(std::string const &str); + +/// Converts an IteratorAlgorithmID enumerant to a string +char const *to_string(IteratorAlgorithmID type, bool pretty = false); + +/// Converts an IteratorAlgorithmID enumerant from a string +template <> +IteratorAlgorithmID from_string(std::string const &str); + +/// Converts a ConvKind enumerant to a string +char const *to_string(ConvKind type, bool pretty = false); + +/// Converts a ConvKind enumerant from a string +template <> +ConvKind from_string(std::string const &str); + /// Lexical cast from int64_t to string std::string lexical_cast(int64_t int_value); diff --git a/tools/library/scripts/conv2d_operation.py b/tools/library/scripts/conv2d_operation.py new file mode 100644 index 0000000000..e164bd007e --- /dev/null +++ b/tools/library/scripts/conv2d_operation.py @@ -0,0 +1,344 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# +# + +import enum +import os.path +import shutil + +from library import * + +################################################################################################### + +# +class Conv2dOperation: + 
# + def __init__(self, conv_kind, iterator_algorithm, arch, tile_description, A, B, C, element_epilogue, \ + stride_support, epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4): + + self.operation_kind = OperationKind.Conv2d + self.arch = arch + self.tile_description = tile_description + self.conv_kind = conv_kind + self.A = A + self.B = B + self.C = C + self.element_epilogue = element_epilogue + self.epilogue_functor = epilogue_functor + self.iterator_algorithm = iterator_algorithm + self.stride_support = stride_support + self.swizzling_functor = swizzling_functor + # + def is_complex(self): + complex_operators = [ + MathOperation.multiply_add_complex, + MathOperation.multiply_add_complex_gaussian + ] + return self.tile_description.math_instruction.math_operation in complex_operators + + # + def accumulator_type(self): + accum = self.tile_description.math_instruction.element_accumulator + + if self.is_complex(): + return get_complex_from_real(accum) + + return accum + + # + def core_name(self): + ''' The basic operation kind is prefixed with a letter indicating the accumulation type. ''' + + intermediate_type = '' + + if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp: + inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape) + if self.tile_description.math_instruction.element_a != self.A.element and \ + self.tile_description.math_instruction.element_a != self.accumulator_type(): + intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] + else: + inst_shape = '' + + return "%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \ + inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm]) + + # + def extended_name(self): + ''' Append data types if they differ from compute type. ''' + if self.C.element != self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${element_c}_${core_name}_${element_a}" + elif self.C.element == self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${core_name}_${element_a}" + else: + extended_name = "${core_name}" + + extended_name = SubstituteTemplate(extended_name, { + 'element_a': DataTypeNames[self.A.element], + 'element_c': DataTypeNames[self.C.element], + 'core_name': self.core_name() + }) + + return extended_name + + # + def layout_name(self): + return "%s" % (ShortLayoutTypeNames[self.A.layout]) + + # + def configuration_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. 
''' + + opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] + + threadblock = "%dx%d_%dx%d" % ( + self.tile_description.threadblock_shape[0], + self.tile_description.threadblock_shape[1], + self.tile_description.threadblock_shape[2], + self.tile_description.stages + ) + + if self.stride_support == StrideSupport.Unity: + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride" + else: + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}" + + return SubstituteTemplate( + configuration_name, + { + 'opcode_class': opcode_class_name, + 'extended_name': self.extended_name(), + 'threadblock': threadblock, + 'layout': self.layout_name(), + } + ) + + # + def procedural_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' + return self.configuration_name() + +################################################################################################### +# +# Emits single instances of a CUTLASS device-wide operator +# +################################################################################################### + +class EmitConv2dInstance: + def __init__(self): + self.template = """ + // Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}" + using ${operation_name}_base = + typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}< + ${element_a}, + ${layout_a}, + ${element_b}, + ${layout_b}, + ${element_c}, + ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + ${stages}, + ${math_operator}, + ${iterator_algorithm}, + ${stride_support} + >::Kernel; +""" + + + def emit(self, operation): + + warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + values = { + 'operation_name': operation.procedural_name(), + 'conv_kind': ConvKindTag[operation.conv_kind], + 'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[operation.A.layout], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[operation.B.layout], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 
'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm], + 'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(), + 'stride_support': StrideSupportTag[operation.stride_support], + 'math_operator': 'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex() else \ + MathOperationTag[operation.tile_description.math_instruction.math_operation] + } + + return SubstituteTemplate(self.template, values) + +################################################################################################### +# +# Generator functions for all layouts +# +################################################################################################### + +# +def GenerateConv2dTensorOp(manifest, tile_descriptions, min_cc, align = 128): + + for tile in tile_descriptions: + for conv_kind in [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]: + + if conv_kind == ConvKind.Fprop or (tile.math_instruction.element_accumulator in [DataType.f16, DataType.f32]): + + # + output_types = [tile.math_instruction.element_a, tile.math_instruction.element_accumulator] \ + if DataTypeSize[tile.math_instruction.element_accumulator] == 32 \ + else [tile.math_instruction.element_accumulator,] + + for output_type in output_types: + A = TensorDescription(tile.math_instruction.element_a, LayoutType.TensorNHWC, int(align / DataTypeSize[tile.math_instruction.element_a])) + B = TensorDescription(tile.math_instruction.element_b, LayoutType.TensorNHWC, int(align / DataTypeSize[tile.math_instruction.element_b])) + C = TensorDescription(output_type, LayoutType.TensorNHWC, max(1, int(align / DataTypeSize[output_type]))) + + manifest.append(Conv2dOperation(conv_kind, min_cc, tile, A, B, C, tile.math_instruction.element_accumulator)) + +################################################################################################### +# +# Emitters functions for all targets +# +################################################################################################### + +class EmitConv2dConfigurationLibrary: + def __init__(self, operation_path, configuration_name): + self.configuration_name = configuration_name + self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name) + + self.instance_emitter = EmitConv2dInstance() + + self.instance_template = """ +${operation_instance} + +// Derived class +struct ${operation_name} : + public ${operation_name}_base { }; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + self.header_template = """ +/* + Generated by conv2d_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "library_internal.h" +#include "conv2d_operation.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// +""" + + self.configuration_header = """ + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_${configuration_name}(Manifest &manifest) { + +""" + + self.configuration_instance = """ + using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution< + ${operation_name}>; + + manifest.append(new cutlass::library::Conv2dOperation< + Operation_${operation_name}>( + "${operation_name}")); + +""" + + self.configuration_epilogue = """ +} +""" + self.epilogue_template = """ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + + # + def __enter__(self): + self.configuration_file = open(self.configuration_path, "w") + self.configuration_file.write(SubstituteTemplate(self.header_template, { + 'configuration_name': self.configuration_name + })) + self.operations = [] + return self + + # + def emit(self, operation): + self.operations.append(operation) + self.configuration_file.write(SubstituteTemplate(self.instance_template, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name(), + 'operation_instance': self.instance_emitter.emit(operation) + })) + + # + def __exit__(self, exception_type, exception_value, traceback): + + self.configuration_file.write(SubstituteTemplate(self.configuration_header, { + 'configuration_name': self.configuration_name + })) + + for operation in self.operations: + self.configuration_file.write(SubstituteTemplate(self.configuration_instance, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name() + })) + + self.configuration_file.write(self.configuration_epilogue) + self.configuration_file.write(self.epilogue_template) + self.configuration_file.close() + + +################################################################################################### +################################################################################################### + diff --git a/tools/library/scripts/conv3d_operation.py b/tools/library/scripts/conv3d_operation.py new file mode 100644 index 0000000000..4ba31b0395 --- /dev/null +++ b/tools/library/scripts/conv3d_operation.py @@ -0,0 +1,321 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# +# + +import enum +import os.path +import shutil + +from library import * + +################################################################################################### + +# +class Conv3dOperation: + # + def __init__(self, conv_kind, iterator_algorithm, arch, tile_description, A, B, C, element_epilogue, \ + stride_support, epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4): + + self.operation_kind = OperationKind.Conv3d + self.arch = arch + self.tile_description = tile_description + self.conv_kind = conv_kind + self.A = A + self.B = B + self.C = C + self.element_epilogue = element_epilogue + self.epilogue_functor = epilogue_functor + 
self.iterator_algorithm = iterator_algorithm + self.stride_support = stride_support + self.swizzling_functor = swizzling_functor + + # + def core_name(self): + ''' The basic operation kind is prefixed with a letter indicating the accumulation type. ''' + + intermediate_type = '' + + if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp: + inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape) + if self.tile_description.math_instruction.element_a != self.A.element and \ + self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator: + intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] + else: + inst_shape = '' + + return "%s%s%s%s3d_%s" % (ShortDataTypeNames[self.tile_description.math_instruction.element_accumulator], \ + inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm]) + + # + def extended_name(self): + ''' Append data types if they differ from compute type. ''' + if self.C.element != self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${element_c}_${core_name}_${element_a}" + elif self.C.element == self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${core_name}_${element_a}" + else: + extended_name = "${core_name}" + + extended_name = SubstituteTemplate(extended_name, { + 'element_a': DataTypeNames[self.A.element], + 'element_c': DataTypeNames[self.C.element], + 'core_name': self.core_name() + }) + + return extended_name + + # + def configuration_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' + + opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] + + threadblock = "%dx%d_%dx%d" % ( + self.tile_description.threadblock_shape[0], + self.tile_description.threadblock_shape[1], + self.tile_description.threadblock_shape[2], + self.tile_description.stages + ) + + if self.stride_support == StrideSupport.Unity: + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_unity_stride" + else: + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}" + + return SubstituteTemplate( + configuration_name, + { + 'opcode_class': opcode_class_name, + 'extended_name': self.extended_name(), + 'threadblock': threadblock, + } + ) + + # + def procedural_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. 
''' + return self.configuration_name() + +################################################################################################### +# +# Emits single instances of a CUTLASS device-wide operator +# +################################################################################################### + +class EmitConv3dInstance: + def __init__(self): + self.template = """ + // Conv3d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}" + using ${operation_name}_base = + typename cutlass::conv::kernel::DefaultConv3d${conv_kind_name}< + ${element_a}, + cutlass::layout::TensorNDHWC, + ${element_b}, + cutlass::layout::TensorNDHWC, + ${element_c}, + cutlass::layout::TensorNDHWC, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + ${stages}, + cutlass::arch::OpMultiplyAdd, + ${iterator_algorithm}, + ${stride_support} + >::Kernel; +""" + + + def emit(self, operation): + + warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + values = { + 'operation_name': operation.procedural_name(), + 'conv_kind': ConvKindTag[operation.conv_kind], + 'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[operation.A.layout], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[operation.B.layout], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.tile_description.math_instruction.element_accumulator], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm], + 'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(), + 'stride_support': 
StrideSupportTag[operation.stride_support] + } + + return SubstituteTemplate(self.template, values) + +################################################################################################### +# +# Generator functions for all layouts +# +################################################################################################### + +# +def GenerateConv3dTensorOp(manifest, tile_descriptions, min_cc, align = 128): + + for tile in tile_descriptions: + for conv_kind in [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]: + + if conv_kind == ConvKind.Fprop or (tile.math_instruction.element_accumulator in [DataType.f16, DataType.f32]): + + # + output_types = [tile.math_instruction.element_a, tile.math_instruction.element_accumulator] \ + if DataTypeSize[tile.math_instruction.element_accumulator] == 32 \ + else [tile.math_instruction.element_accumulator,] + + for output_type in output_types: + A = TensorDescription(tile.math_instruction.element_a, LayoutType.TensorNDHWC, int(align / DataTypeSize[tile.math_instruction.element_a])) + B = TensorDescription(tile.math_instruction.element_b, LayoutType.TensorNDHWC, int(align / DataTypeSize[tile.math_instruction.element_b])) + C = TensorDescription(output_type, LayoutType.TensorNDHWC, max(1, int(align / DataTypeSize[output_type]))) + + manifest.append(Conv3dOperation(conv_kind, min_cc, tile, A, B, C, tile.math_instruction.element_accumulator)) + +################################################################################################### +# +# Emitters functions for all targets +# +################################################################################################### + +class EmitConv3dConfigurationLibrary: + def __init__(self, operation_path, configuration_name): + self.configuration_name = configuration_name + self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name) + + self.instance_emitter = EmitConv3dInstance() + + self.instance_template = """ +${operation_instance} + +// Derived class +struct ${operation_name} : + public ${operation_name}_base { }; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + self.header_template = """ +/* + Generated by conv3d_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "library_internal.h" +#include "conv3d_operation.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// +""" + + self.configuration_header = """ + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_${configuration_name}(Manifest &manifest) { + +""" + + self.configuration_instance = """ + using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution< + ${operation_name}>; + + manifest.append(new cutlass::library::Conv3dOperation< + Operation_${operation_name}>( + "${operation_name}")); + +""" + + self.configuration_epilogue = """ +} +""" + self.epilogue_template = """ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + + # + def __enter__(self): + self.configuration_file = open(self.configuration_path, "w") + self.configuration_file.write(SubstituteTemplate(self.header_template, { + 'configuration_name': self.configuration_name + })) + self.operations = [] + return self + + # + def emit(self, operation): + self.operations.append(operation) + self.configuration_file.write(SubstituteTemplate(self.instance_template, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name(), + 'operation_instance': self.instance_emitter.emit(operation) + })) + + # + def __exit__(self, exception_type, exception_value, traceback): + + self.configuration_file.write(SubstituteTemplate(self.configuration_header, { + 'configuration_name': self.configuration_name + })) + + for operation in self.operations: + self.configuration_file.write(SubstituteTemplate(self.configuration_instance, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name() + })) + + self.configuration_file.write(self.configuration_epilogue) + self.configuration_file.write(self.epilogue_template) + self.configuration_file.close() + + +################################################################################################### +################################################################################################### + diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index f21acaaf6e..491997cb89 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -11,7 +11,6 @@ from library import * from manifest import * -from gemm_operation import * ################################################################################################### # @@ -118,10 +117,9 @@ def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_t gemm_kinds = [GemmKind.PlanarComplex, GemmKind.PlanarComplexArray] - # by default, only generate the largest tile and largest alignment + # by default, planar complex gemm kernels are not generated if manifest.args.kernels == '': - tile_descriptions = [tile_descriptions[0],] - alignment_constraints = [alignment_constraints[0],] + return for gemm_kind in gemm_kinds: for layout in layouts: @@ -141,6 +139,103 @@ def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_t return 
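The CreateConv2dOperator / CreateConv3dOperator functions added below feed the EmitConv2dInstance template shown earlier in conv2d_operation.py. For orientation, an emitted instance expands to roughly the following; the data types, tile shapes, and alignment here are illustrative choices and not necessarily a configuration the default generator emits:

#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"

// Example Fprop kernel: f16 NHWC operands, f32 accumulation, SM80 tensor ops,
// optimized (precomputed) iterator algorithm, strided problem support.
using ExampleConv2dFpropKernel = cutlass::conv::kernel::DefaultConv2dFprop<
    cutlass::half_t, cutlass::layout::TensorNHWC,   // A: activations
    cutlass::half_t, cutlass::layout::TensorNHWC,   // B: filters
    cutlass::half_t, cutlass::layout::TensorNHWC,   // C: output
    float,                                          // accumulator
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,         // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,           // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,            // instruction shape
    cutlass::epilogue::thread::LinearCombination<cutlass::half_t, 8, float, float>,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
    3,                                              // stages
    cutlass::arch::OpMultiplyAdd,
    cutlass::conv::IteratorAlgorithm::kOptimized,
    cutlass::conv::StrideSupport::kStrided
>::Kernel;

// Device-level operator wrapping the kernel, as registered with the manifest.
using ExampleConv2dFprop =
    cutlass::conv::device::ImplicitGemmConvolution<ExampleConv2dFpropKernel>;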
########################################################################################################### +# ConvolutionOperator support variations +# ____________________________________________________________________ +# ConvolutionalOperator | Analytic | Optimized +# ____________________________________________________________________ +# | Fprop | (strided) | (strided) +# | Dgrad | (strided, unity*) | (unity) +# | Wgrad | (strided) | (strided) +# ____________________________________________________________________ +# +# Note : Operator marked (*) are supported but not generated to keep the instantiated kernel count low +########################################################################################################### +# Convolution for 2D operations +def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment, \ + conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination): + + element_a, element_b, element_c, element_epilogue = data_type + + # one exceptional case + alignment_c = min(8, alignment) + + # iterator algorithm (analytic and optimized) + iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized] + + # by default, only generate the largest tile size + if manifest.args.kernels == '': + tile_descriptions = [tile_descriptions[0],] + + operations = [] + + for tile in tile_descriptions: + for conv_kind in conv_kinds: + for iterator_algorithm in iterator_algorithms: + A = TensorDescription(element_a, layout[0], alignment) + B = TensorDescription(element_b, layout[1], alignment) + C = TensorDescription(element_c, layout[2], alignment_c) + + # unity stride only for Optimized Dgrad + if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad): + new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ + A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + # strided dgrad is not supported by Optimized Dgrad + if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad): + continue + + # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic) + new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ + A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + return operations + +# Convolution for 3D operations +def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignment, \ + conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination): + + element_a, element_b, element_c, element_epilogue = data_type + + # one exceptional case + alignment_c = min(8, alignment) + + # iterator algorithm (analytic and optimized) + iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized] + + # by default, only generate the largest tile size + if manifest.args.kernels == '': + tile_descriptions = [tile_descriptions[0],] + + operations = [] + + for tile in tile_descriptions: + for conv_kind in conv_kinds: + for iterator_algorithm in iterator_algorithms: + A = TensorDescription(element_a, layout, alignment) + B = TensorDescription(element_b, layout, alignment) + C = TensorDescription(element_c, layout, alignment_c) + + # optimized conv3d iterator 
algorithm is only for Wgrad + if (iterator_algorithm == IteratorAlgorithm.Optimized) \ + and ((conv_kind == ConvKind.Fprop) or (conv_kind == ConvKind.Dgrad)): + continue + + # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic) + new_operation = Conv3dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ + A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + + return operations + ################################################################################################### ################################################################################################### @@ -191,11 +286,57 @@ def GenerateSM50_Simt(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + if math_inst.element_a == DataType.f32: + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) +# + +# +def GenerateSM50_Simt_complex(manifest, args): + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 1], \ + DataType.f32, DataType.f32, DataType.f32, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add_complex), + ] + + min_cc = 50 + max_cc = 1024 + + alignment_constraints = [1,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + DataType.cf32, + DataType.cf32, + DataType.cf32, + DataType.cf32, + ] + + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) + + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) # # def GenerateSM50(manifest, args): GenerateSM50_Simt(manifest, args) + GenerateSM50_Simt_complex(manifest, args) ################################################################################################### ################################################################################################### @@ -362,6 +503,9 @@ def GenerateSM70_TensorOp_884(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 8) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -375,6 +519,8 @@ def GenerateSM70_TensorOp_884(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, 8) + # def GenerateSM70_PlanarComplexTensorOp_884(manifest, args): @@ -504,50 +650,10 @@ def GenerateSM70_WmmaTensorOp_161616(manifest, args): # ################################################################################################## # -def GenerateSM70_Simt_complex(manifest, args): - math_instructions = [ - MathInstruction( \ - [1, 1, 1], \ - DataType.f32, DataType.f32, DataType.f32, \ - OpcodeClass.Simt, \ - MathOperation.multiply_add_complex), - ] - - min_cc = 70 - max_cc = 1024 - - alignment_constraints = [1,] - - for math_inst in math_instructions: - tile_descriptions = [ - TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 32, 16], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([32, 64, 16], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([32, 32, 16], 2, [2, 2, 1], math_inst, min_cc, max_cc), - ] - data_type = [ - DataType.cf32, - DataType.cf32, - DataType.cf32, - DataType.cf32 - ] - - complex_transforms = [ - (ComplexTransform.none, ComplexTransform.none), - (ComplexTransform.conj, ComplexTransform.none), - (ComplexTransform.none, ComplexTransform.conj), - (ComplexTransform.conj, ComplexTransform.conj) - ] - -# def GenerateSM70(manifest, args): GenerateSM70_TensorOp_884(manifest, args) GenerateSM70_PlanarComplexTensorOp_884(manifest, args) - GenerateSM70_Simt_complex(manifest, args) # To limit build size, WMMA GEMMs are disabled for now. # @@ -607,6 +713,9 @@ def GenerateSM75_TensorOp_1688(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 8) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -620,6 +729,8 @@ def GenerateSM75_TensorOp_1688(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, 8) + # # @@ -738,6 +849,10 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -753,6 +868,9 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: if op.tile_description.threadblock_shape[1] >= 128: op.C.alignment = 16 @@ -794,6 +912,8 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), @@ -809,9 +929,13 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) +# conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) +# +# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, +# data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: op.C.alignment = 8 - # # @@ -862,6 +986,10 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -877,6 +1005,9 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: if op.tile_description.threadblock_shape[1] >= 128: op.C.alignment = 8 @@ -920,9 +1051,9 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 128], 2, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 128], 2, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), ] # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) @@ -938,9 +1069,13 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) +# conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) +# +# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, +# data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: op.C.alignment = 16 - # # @@ -1074,6 +1209,8 @@ def GenerateSM75_Simt_complex(manifest, args): (ComplexTransform.conj, ComplexTransform.conj) ] + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) # def GenerateSM75(manifest, args): @@ -1124,6 +1261,7 @@ def GenerateSM80_TensorOp_16816(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [8, 4, 2] @@ -1137,10 +1275,10 @@ def GenerateSM80_TensorOp_16816(manifest, args): TileDescription([128, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, 
min_cc, max_cc), @@ -1157,6 +1295,10 @@ def GenerateSM80_TensorOp_16816(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 8) + CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type, 8) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -1170,6 +1312,8 @@ def GenerateSM80_TensorOp_16816(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, 8) + CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type_mixed, 8) # # @@ -1205,22 +1349,23 @@ def GenerateSM80_SparseTensorOp_16832(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [8, 4, 2] for math_inst in math_instructions: tile_descriptions = [ - TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), ] @@ -1348,6 +1493,7 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [16,] @@ -1361,10 +1507,10 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args): TileDescription([128, 64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 64], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + 
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), @@ -1382,6 +1528,13 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: if op.tile_description.threadblock_shape[1] >= 128: op.C.alignment = 16 @@ -1409,21 +1562,22 @@ def GenerateSM80_SparseTensorOp_16864_TN(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [16,] tile_descriptions = [ - TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([ 64, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), ] @@ -1489,10 +1643,14 @@ def GenerateSM80_TensorOp_16832_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) - + +# conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) +# +# 
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, +# data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: op.C.alignment = 8 - # # @@ -1520,6 +1678,7 @@ def GenerateSM80_TensorOp_16864_TN(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [32,] @@ -1533,14 +1692,14 @@ def GenerateSM80_TensorOp_16864_TN(manifest, args): TileDescription([128, 64, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 128], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 256], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 256], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 256], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 256], 5, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), ] data_type = [math_inst.element_a, math_inst.element_b, math_inst.element_accumulator, DataType.s32] @@ -1582,20 +1741,21 @@ def GenerateSM80_SparseTensorOp_168128_TN(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [32,] tile_descriptions = [ - TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 256], 3, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 256], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 256], 3, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 128, 256], 6, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 512], 3, [2, 4, 1], math_inst, min_cc, 
max_cc_smem_limited), + TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([ 64, 128, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc), ] @@ -1655,9 +1815,7 @@ def GenerateSM80_TensorOp_16864_Interleaved(manifest, args): TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 128, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 128], 10, [2, 2, 1], math_inst, min_cc, max_cc), ] data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32] @@ -1666,7 +1824,12 @@ def GenerateSM80_TensorOp_16864_Interleaved(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) - + +# conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) +# +# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, +# data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: op.C.alignment = 16 # @@ -1744,6 +1907,7 @@ def GenerateSM80_TensorOp_1688(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [4, 2, 1] @@ -1757,11 +1921,11 @@ def GenerateSM80_TensorOp_1688(manifest, args): TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc), @@ -1787,6 +1951,10 @@ def GenerateSM80_TensorOp_1688(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 4) + + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, 4) # # @@ -1822,6 +1990,7 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [4, 2, 1] @@ -1835,11 
+2004,11 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, args): TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc), @@ -1850,6 +2019,8 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 4) # # @@ -1875,22 +2046,23 @@ def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [4, 2, 1] for math_inst in math_instructions: tile_descriptions = [ - TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([256, 64, 32], 3, [4, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), ] @@ -1971,13 +2143,14 @@ def GenerateSM80_TensorOp_884(manifest, 
args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [1,] tile_descriptions = [ - TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), @@ -2090,7 +2263,7 @@ def GenerateSM80_TensorOp_884_complex_gaussian(manifest, args): ################################################################################################### # -def GenerateSM80_Simt(manifest, args): +def GenerateSM80_Simt_f32(manifest, args): layouts = [ (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), @@ -2136,8 +2309,55 @@ def GenerateSM80_Simt(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) # + +# +def GenerateSM80_Simt_f64(manifest, args): + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 1], \ + DataType.f64, DataType.f64, DataType.f64, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([128, 128, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 128, 8], 5, [1, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) +# + + ################################################################################################## # def GenerateSM80_Simt_complex(manifest, args): @@ -2154,7 +2374,29 @@ def GenerateSM80_Simt_complex(manifest, args): alignment_constraints = [1,] + data_type = [ + DataType.cf32, + DataType.cf32, + DataType.cf32, + DataType.cf32 + ] + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), 
+ ] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + for math_inst in math_instructions: + tile_descriptions = [ TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc), @@ -2165,20 +2407,11 @@ def GenerateSM80_Simt_complex(manifest, args): TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc), ] - data_type = [ - DataType.cf32, - DataType.cf32, - DataType.cf32, - DataType.cf32 - ] - complex_transforms = [ - (ComplexTransform.none, ComplexTransform.none), - (ComplexTransform.conj, ComplexTransform.none), - (ComplexTransform.none, ComplexTransform.conj), - (ComplexTransform.conj, ComplexTransform.conj) - ] + CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, alignment_constraints, complex_transforms) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) # ################################################################################################### @@ -2202,7 +2435,8 @@ def GenerateSM80(manifest, args): GenerateSM80_SparseTensorOp_168128_TN(manifest, args) GenerateSM80_TensorOp_16864_Interleaved(manifest, args) GenerateSM80_TensorOp_168256(manifest, args) - GenerateSM80_Simt(manifest, args) + GenerateSM80_Simt_f32(manifest, args) + GenerateSM80_Simt_f64(manifest, args) GenerateSM80_Simt_complex(manifest, args) ################################################################################################### diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py index 2bb062da95..b9538cdbc5 100644 --- a/tools/library/scripts/library.py +++ b/tools/library/scripts/library.py @@ -71,6 +71,7 @@ class DataType(enum.Enum): cu16 = enum_auto() cu32 = enum_auto() cu64 = enum_auto() + invalid = enum_auto() # ShortDataTypeNames = { @@ -260,6 +261,8 @@ class MathOperation(enum.Enum): class LayoutType(enum.Enum): ColumnMajor = enum_auto() RowMajor = enum_auto() + ColumnMajorInterleaved2 = enum_auto() + RowMajorInterleaved2 = enum_auto() ColumnMajorInterleaved32 = enum_auto() RowMajorInterleaved32 = enum_auto() ColumnMajorInterleaved64 = enum_auto() @@ -268,13 +271,17 @@ class LayoutType(enum.Enum): TensorNDHWC = enum_auto() TensorNCHW = enum_auto() TensorNGHWC = enum_auto() - TensorNCxHW32 = enum_auto() - TensorNCxHW64 = enum_auto() + TensorNC32HW32 = enum_auto() + TensorNC64HW64 = enum_auto() + TensorC32RSK32 = enum_auto() + TensorC64RSK64 = enum_auto() # LayoutTag = { LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor', LayoutType.RowMajor: 'cutlass::layout::RowMajor', + LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>', + LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>', LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>', LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>', LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>', @@ -283,14 +290,18 @@ class LayoutType(enum.Enum): LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC', LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW', LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC', - 
LayoutType.TensorNCxHW32: 'cutlass::layout::TensorNCxHW32', - LayoutType.TensorNCxHW64: 'cutlass::layout::TensorNCxHW64' + LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>', + LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>', + LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>', + LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>', } # TransposedLayout = { LayoutType.ColumnMajor: LayoutType.RowMajor, LayoutType.RowMajor: LayoutType.ColumnMajor, + LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2, + LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2, LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32, LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32, LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64, @@ -301,17 +312,21 @@ class LayoutType(enum.Enum): # ShortLayoutTypeNames = { LayoutType.ColumnMajor: 'n', + LayoutType.ColumnMajorInterleaved32: 'n2', LayoutType.ColumnMajorInterleaved32: 'n32', LayoutType.ColumnMajorInterleaved64: 'n64', LayoutType.RowMajor: 't', + LayoutType.RowMajorInterleaved2: 't2', LayoutType.RowMajorInterleaved32: 't32', LayoutType.RowMajorInterleaved64: 't64', LayoutType.TensorNHWC: 'nhwc', LayoutType.TensorNDHWC: 'ndhwc', LayoutType.TensorNCHW: 'nchw', LayoutType.TensorNGHWC: 'nghwc', - LayoutType.TensorNCxHW32: 'ncxhw32', - LayoutType.TensorNCxHW64: 'ncxhw64' + LayoutType.TensorNC32HW32: 'nc32hw32', + LayoutType.TensorNC64HW64: 'nc64hw64', + LayoutType.TensorC32RSK32: 'c32rsk32', + LayoutType.TensorC64RSK64: 'c64rsk64' } # @@ -346,9 +361,14 @@ class OpcodeClass(enum.Enum): # class OperationKind(enum.Enum): Gemm = enum_auto() + Conv2d = enum_auto() + Conv3d = enum_auto() + # OperationKindNames = { OperationKind.Gemm: 'gemm' + , OperationKind.Conv2d: 'conv2d' + , OperationKind.Conv3d: 'conv3d' } # @@ -424,6 +444,61 @@ class SwizzlingFunctor(enum.Enum): SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>', SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>', } + +################################################################################################### + +# +class ConvKind(enum.Enum): + Fprop = enum_auto() + Dgrad = enum_auto() + Wgrad = enum_auto() + +# +ConvKindTag = { + ConvKind.Fprop: 'cutlass::conv::Operator::kFprop', + ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad', + ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad' +} + +ConvKindNames = { + ConvKind.Fprop: 'fprop', + ConvKind.Dgrad: 'dgrad', + ConvKind.Wgrad: 'wgrad', +} + +# +class IteratorAlgorithm(enum.Enum): + Analytic = enum_auto() + Optimized = enum_auto() + +# +IteratorAlgorithmTag = { + IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic', + IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized', +} + +IteratorAlgorithmNames = { + IteratorAlgorithm.Analytic: 'analytic', + IteratorAlgorithm.Optimized: 'optimized', +} + +# +class StrideSupport(enum.Enum): + Strided = enum_auto() + Unity = enum_auto() + +# +StrideSupportTag = { + StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided', + StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity', +} + +StrideSupportNames = { + StrideSupport.Strided: '', + StrideSupport.Unity: 'unity_stride', +} + + ################################################################################################### # diff --git a/tools/library/scripts/manifest.py 
b/tools/library/scripts/manifest.py index 2f0aa24ecb..409ec09a27 100644 --- a/tools/library/scripts/manifest.py +++ b/tools/library/scripts/manifest.py @@ -10,6 +10,9 @@ from library import * from gemm_operation import * +from conv2d_operation import * +from conv3d_operation import * + ################################################################################################### class EmitOperationKindLibrary: @@ -20,6 +23,8 @@ def __init__(self, generated_path, kind, args): self.emitters = { OperationKind.Gemm: EmitGemmConfigurationLibrary + , OperationKind.Conv2d: EmitConv2dConfigurationLibrary + , OperationKind.Conv3d: EmitConv3dConfigurationLibrary } self.configurations = []; @@ -112,7 +117,10 @@ class Manifest: def __init__(self, args): self.operations = {} self.args = args - self.compute_capabilities = [int(x) for x in args.architectures.split(';')] + + architectures = args.architectures.split(';') if len(args.architectures) else ['50',] + self.compute_capabilities = [int(x) for x in architectures] + self.selected_kernels = [] if args.operations == 'all': @@ -121,6 +129,8 @@ def __init__(self, args): operations_list = [ OperationKind.Gemm + , OperationKind.Conv2d + , OperationKind.Conv3d ] self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')] diff --git a/tools/library/src/conv2d_operation.h b/tools/library/src/conv2d_operation.h new file mode 100644 index 0000000000..5e8f887fd1 --- /dev/null +++ b/tools/library/src/conv2d_operation.h @@ -0,0 +1,380 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines operations for all CONV operation kinds in CUTLASS Library. 
+*/ + +#pragma once +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/library/library.h" +#include "library_internal.h" +#include "cutlass/util/host_tensor.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/core_io.h" +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class Conv2dOperationBase : public Operation { +public: + + using Operator = Operator_; + + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = Operator::kIteratorAlgorithm; + static cutlass::conv::Operator const kConvolutionalOperator = Operator::kConvolutionalOperator; + + using OperatorArguments = typename Operator::Arguments; + +protected: + + /// + ConvDescription description_; + +public: + + /// Constructor + Conv2dOperationBase(char const *name = "unknown_conv2d") { + + description_.name = name; + description_.provider = Provider::kCUTLASS; + description_.kind = OperationKind::kConv2d; + description_.conv_dim = Operator::kConvDim; + + description_.iterator_algorithm = IteratorAlgorithmMap::kId; + + description_.tile_description.threadblock_shape = make_Coord( + Operator::ThreadblockShape::kM, + Operator::ThreadblockShape::kN, + Operator::ThreadblockShape::kK); + + description_.tile_description.threadblock_stages = Operator::kStages; + + description_.tile_description.warp_count = make_Coord( + Operator::ImplicitGemmKernel::WarpCount::kM, + Operator::ImplicitGemmKernel::WarpCount::kN, + Operator::ImplicitGemmKernel::WarpCount::kK); + + description_.tile_description.math_instruction.instruction_shape = make_Coord( + Operator::InstructionShape::kM, + Operator::InstructionShape::kN, + Operator::InstructionShape::kK); + + description_.tile_description.math_instruction.element_accumulator = + NumericTypeMap::kId; + + description_.tile_description.math_instruction.opcode_class = + OpcodeClassMap::kId; + + description_.tile_description.math_instruction.math_operation = + MathOperationMap::kId; + + description_.tile_description.minimum_compute_capability = + ArchMap::kMin; + + description_.tile_description.maximum_compute_capability = + ArchMap::kMax; + + description_.A = make_TensorDescription(); + description_.B = make_TensorDescription(); + description_.C = make_TensorDescription(); + description_.element_epilogue = NumericTypeMap::kId; + + // TODO: Add split k mode Serial and parallel to convolutions + // description_.split_k_mode = Operator::kSplitK ? 
SplitKMode::kSerial : SplitKMode::kNone; + + } + + /// Returns the description of the GEMM operation + virtual OperationDescription const & description() const { + return description_; + } +}; + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Conv2d library operation class for cutlass profiler +// +/////////////////////////////////////////////////////////////////////////////////////////////////// +template +class Conv2dOperation : public Conv2dOperationBase { +public: + + using Operator = Operator_; + + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + static cutlass::conv::Operator const kConvolutionalOperator = Operator::kConvolutionalOperator; + + using OperatorArguments = typename Operator::Arguments; + +public: + /// Constructor + Conv2dOperation(char const *name = "unknown_conv2d_fprop") : Conv2dOperationBase(name) { + this->description_.conv_kind = ConvKindMap::kId; + } + +protected: + + /// Constructs the arguments structure given the configuration and arguments + static Status construct_arguments_( + OperatorArguments &operator_args, + Conv2dConfiguration const *configuration) { + + + operator_args.problem_size = configuration->problem_size; + + operator_args.ref_A = + { + nullptr, + LayoutA::packed(implicit_gemm_tensor_a_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_B = + { + nullptr, + LayoutB::packed(implicit_gemm_tensor_b_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_C = + { + nullptr, + LayoutC::packed(implicit_gemm_tensor_c_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_D = + { + nullptr, + LayoutC::packed(implicit_gemm_tensor_c_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.split_k_mode = configuration->split_k_mode; + + return Status::kSuccess; + } + + /// Constructs the arguments structure given the configuration and arguments + static Status update_arguments_( + OperatorArguments &operator_args, + ConvArguments const *arguments) { + + if (arguments->pointer_mode == ScalarPointerMode::kHost) { + typename Operator::EpilogueOutputOp::Params params( + *static_cast(arguments->alpha), + *static_cast(arguments->beta) + ); + operator_args.output_op = params; + } + else if (arguments->pointer_mode == ScalarPointerMode::kDevice){ + typename Operator::EpilogueOutputOp::Params params( + static_cast(arguments->alpha), + static_cast(arguments->beta) + ); + operator_args.output_op = params; + } + else { + return Status::kErrorInvalidProblem; + } + + operator_args.ref_A.reset(static_cast(const_cast(arguments->A))); + operator_args.ref_B.reset(static_cast(const_cast(arguments->B))); + operator_args.ref_C.reset(static_cast(const_cast(arguments->C))); + operator_args.ref_D.reset(static_cast(const_cast(arguments->D))); + + return Status::kSuccess; + } + +public: + + /// Returns success if the operation can proceed + virtual Status can_implement( + void const *configuration_ptr, + void const *arguments_ptr) const { + + Conv2dConfiguration const *configuration = + static_cast(configuration_ptr); + + 
ConvArguments const *arguments = + static_cast(arguments_ptr); + + OperatorArguments args; + + Status status = construct_arguments_(args, configuration); + + if (status != Status::kSuccess) { + return status; + } + + status = update_arguments_(args, arguments); + + if (status != Status::kSuccess) { + return status; + } + + return Operator::can_implement(args); + + } + + /// Gets the host-side workspace + virtual uint64_t get_host_workspace_size( + void const *configuration) const { + + return sizeof(Operator); + } + + /// Gets the device-side workspace + virtual uint64_t get_device_workspace_size( + void const *configuration_ptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return 0; + } + + return Operator::get_workspace_size(args); + } + + /// Initializes the workspace + virtual Status initialize( + void const *configuration_ptr, + void *host_workspace, + void *device_workspace, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = new (host_workspace) Operator; + //std::cout << "initialize library::Conv2dOperation" << std::endl; + //print_operator_args(args); + return op->initialize(args, device_workspace, stream); + + } + + /// Runs the kernel + virtual Status run( + void const *arguments_ptr, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = update_arguments_( + args, + static_cast(arguments_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = static_cast(host_workspace); + + status = op->update(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + //std::cout << "run library::Conv2dOperation" << std::endl; + //print_operator_args(args); + return op->run(stream); + } + + /// Call print_operator_args from the Conv2dOperation::initialize() + // to dump arguments passed on to cutlass operator for debugging + void print_operator_args(OperatorArguments &operator_args) const { + std::cout << "Conv2dOperation::OperatorArguments" << std::endl + << " problem_size:" << std::endl + << operator_args.problem_size << std::endl + << " split_k_mode: " + << (operator_args.split_k_mode == cutlass::conv::SplitKMode::kSerial ? 
"serial" : "parallel") << std::endl + << " epilouge (alpha, beta): " + << operator_args.output_op.alpha << ", " + << operator_args.output_op.beta << std::endl + << " ref_A (ptr, {stride}): " + << operator_args.ref_A.data() << ", {" + << operator_args.ref_A.stride(0) << ", " + << operator_args.ref_A.stride(1) << ", " + << operator_args.ref_A.stride(2) << "}" << std::endl + << " ref_B (ptr, {stride}): " + << operator_args.ref_B.data() << ", {" + << operator_args.ref_B.stride(0) << ", " + << operator_args.ref_B.stride(1) << ", " + << operator_args.ref_B.stride(2) << "}" << std::endl + << " ref_C (ptr, {stride}): " + << operator_args.ref_C.data() << ", {" + << operator_args.ref_C.stride(0) << ", " + << operator_args.ref_C.stride(1) << ", " + << operator_args.ref_C.stride(2) << "}" << std::endl + << " ref_D (ptr, {stride}): " + << operator_args.ref_D.data() << ", {" + << operator_args.ref_D.stride(0) << ", " + << operator_args.ref_D.stride(1) << ", " + << operator_args.ref_D.stride(2) << "}" << std::endl; + } +}; + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/src/conv3d_operation.h b/tools/library/src/conv3d_operation.h new file mode 100644 index 0000000000..32ad036320 --- /dev/null +++ b/tools/library/src/conv3d_operation.h @@ -0,0 +1,378 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines operations for all CONV operation kinds in CUTLASS Library. 
+*/ + +#pragma once +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/kernel/default_conv3d_dgrad.h" +#include "cutlass/conv/kernel/default_conv3d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/library/library.h" +#include "library_internal.h" +#include "cutlass/util/host_tensor.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/core_io.h" +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class Conv3dOperationBase : public Operation { +public: + + using Operator = Operator_; + + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = Operator::kIteratorAlgorithm; + static cutlass::conv::Operator const kConvolutionalOperator = Operator::kConvolutionalOperator; + + using OperatorArguments = typename Operator::Arguments; + +protected: + + /// + ConvDescription description_; + +public: + + /// Constructor + Conv3dOperationBase(char const *name = "unknown_conv3d") { + + description_.name = name; + description_.provider = Provider::kCUTLASS; + description_.kind = OperationKind::kConv3d; + description_.conv_dim = Operator::kConvDim; + + description_.iterator_algorithm = IteratorAlgorithmMap::kId; + + description_.tile_description.threadblock_shape = make_Coord( + Operator::ThreadblockShape::kM, + Operator::ThreadblockShape::kN, + Operator::ThreadblockShape::kK); + + description_.tile_description.threadblock_stages = Operator::kStages; + + description_.tile_description.warp_count = make_Coord( + Operator::ImplicitGemmKernel::WarpCount::kM, + Operator::ImplicitGemmKernel::WarpCount::kN, + Operator::ImplicitGemmKernel::WarpCount::kK); + + description_.tile_description.math_instruction.instruction_shape = make_Coord( + Operator::InstructionShape::kM, + Operator::InstructionShape::kN, + Operator::InstructionShape::kK); + + description_.tile_description.math_instruction.element_accumulator = + NumericTypeMap::kId; + + description_.tile_description.math_instruction.opcode_class = + OpcodeClassMap::kId; + + description_.tile_description.minimum_compute_capability = + ArchMap::kMin; + + description_.tile_description.maximum_compute_capability = + ArchMap::kMax; + + description_.A = make_TensorDescription(); + description_.B = make_TensorDescription(); + description_.C = make_TensorDescription(); + description_.element_epilogue = NumericTypeMap::kId; + + } + + /// Returns the description of the GEMM operation + virtual OperationDescription const & description() const { + return description_; + } +}; + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Conv2d library operation class for cutlass profiler +// +/////////////////////////////////////////////////////////////////////////////////////////////////// +template 
+class Conv3dOperation : public Conv3dOperationBase { +public: + + using Operator = Operator_; + + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + static cutlass::conv::Operator const kConvolutionalOperator = Operator::kConvolutionalOperator; + + using OperatorArguments = typename Operator::Arguments; + +public: + /// Constructor + Conv3dOperation(char const *name = "unknown_conv3d_fprop") : Conv3dOperationBase(name) { + this->description_.conv_kind = ConvKindMap::kId; + } + +protected: + + /// Constructs the arguments structure given the configuration and arguments + static Status construct_arguments_( + OperatorArguments &operator_args, + Conv3dConfiguration const *configuration) { + + + operator_args.problem_size = configuration->problem_size; + + operator_args.ref_A = + { + nullptr, + LayoutA::packed(implicit_gemm_tensor_a_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_B = + { + nullptr, + LayoutB::packed(implicit_gemm_tensor_b_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_C = + { + nullptr, + LayoutC::packed(implicit_gemm_tensor_c_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_D = + { + nullptr, + LayoutC::packed(implicit_gemm_tensor_c_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.split_k_mode = configuration->split_k_mode; + + return Status::kSuccess; + } + + /// Constructs the arguments structure given the configuration and arguments + static Status update_arguments_( + OperatorArguments &operator_args, + ConvArguments const *arguments) { + + if (arguments->pointer_mode == ScalarPointerMode::kHost) { + typename Operator::EpilogueOutputOp::Params params( + *static_cast(arguments->alpha), + *static_cast(arguments->beta) + ); + operator_args.output_op = params; + } + else if (arguments->pointer_mode == ScalarPointerMode::kDevice){ + typename Operator::EpilogueOutputOp::Params params( + static_cast(arguments->alpha), + static_cast(arguments->beta) + ); + operator_args.output_op = params; + } + else { + return Status::kErrorInvalidProblem; + } + + operator_args.ref_A.reset(static_cast(const_cast(arguments->A))); + operator_args.ref_B.reset(static_cast(const_cast(arguments->B))); + operator_args.ref_C.reset(static_cast(const_cast(arguments->C))); + operator_args.ref_D.reset(static_cast(const_cast(arguments->D))); + + return Status::kSuccess; + } + +public: + + /// Returns success if the operation can proceed + virtual Status can_implement( + void const *configuration_ptr, + void const *arguments_ptr) const { + + Conv3dConfiguration const *configuration = + static_cast(configuration_ptr); + + ConvArguments const *arguments = + static_cast(arguments_ptr); + + OperatorArguments args; + + Status status = construct_arguments_(args, configuration); + + if (status != Status::kSuccess) { + return status; + } + + status = update_arguments_(args, arguments); + + if (status != Status::kSuccess) { + return status; + } + + return Operator::can_implement(args); + + } + + /// Gets the host-side workspace + virtual uint64_t get_host_workspace_size( + void const 
*configuration) const { + + return sizeof(Operator); + } + + /// Gets the device-side workspace + virtual uint64_t get_device_workspace_size( + void const *configuration_ptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return 0; + } + + return Operator::get_workspace_size(args); + } + + /// Initializes the workspace + virtual Status initialize( + void const *configuration_ptr, + void *host_workspace, + void *device_workspace, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = new (host_workspace) Operator; + //std::cout << "initialize library::Conv3dOperation" << std::endl; + //print_operator_args(args); + return op->initialize(args, device_workspace, stream); + + } + + /// Runs the kernel + virtual Status run( + void const *arguments_ptr, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = update_arguments_( + args, + static_cast(arguments_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = static_cast(host_workspace); + + status = op->update(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + //std::cout << "run library::Conv3dOperation" << std::endl; + //print_operator_args(args); + return op->run(stream); + } + + /// Call print_operator_args from the Conv3dOperation::initialize() + // to dump arguments passed on to cutlass operator for debugging + void print_operator_args(OperatorArguments &operator_args) const { + std::cout << "Conv3dOperation::OperatorArguments" << std::endl + << " problem_size: " + << operator_args.problem_size << std::endl + << " split_k_mode: " + << (operator_args.split_k_mode == cutlass::conv::SplitKMode::kSerial ? 
"serial" : "parallel") << std::endl + << " epilouge (alpha, beta): " + << operator_args.output_op.alpha << ", " + << operator_args.output_op.beta << std::endl + << " ref_A (ptr, {stride}): " + << operator_args.ref_A.data() << ", {" + << operator_args.ref_A.stride(0) << ", " + << operator_args.ref_A.stride(1) << ", " + << operator_args.ref_A.stride(2) << ", " + << operator_args.ref_A.stride(3) << "}" << std::endl + << " ref_B (ptr, {stride}): " + << operator_args.ref_B.data() << ", {" + << operator_args.ref_B.stride(0) << ", " + << operator_args.ref_B.stride(1) << ", " + << operator_args.ref_B.stride(2) << ", " + << operator_args.ref_B.stride(3) << "}" << std::endl + << " ref_C (ptr, {stride}): " + << operator_args.ref_C.data() << ", {" + << operator_args.ref_C.stride(0) << ", " + << operator_args.ref_C.stride(1) << ", " + << operator_args.ref_C.stride(2) << ", " + << operator_args.ref_C.stride(3) << "}" << std::endl + << " ref_D (ptr, {stride}): " + << operator_args.ref_D.data() << ", {" + << operator_args.ref_D.stride(0) << ", " + << operator_args.ref_D.stride(1) << ", " + << operator_args.ref_D.stride(2) << ", " + << operator_args.ref_D.stride(3) << "}" << std::endl; + } +}; + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu index bdddf2d7ca..3f19def654 100644 --- a/tools/library/src/handle.cu +++ b/tools/library/src/handle.cu @@ -1037,8 +1037,70 @@ Status Handle::gemm_planar_complex_array( } ///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Finds conv operation instances with Conv::ElementC = Reduction::ElementWorkspace +Operation const* find_conv_operation_for_parallel_reduction(Operation const *operation) { + + ConvDescription const &conv_desc = + static_cast(operation->description()); + + // if the curren conv operation accumulator and output data type match return operation + if(conv_desc.tile_description.math_instruction.element_accumulator == conv_desc.C.element) { + return operation; + } + + // find conv operation to match conv output and reduction workspace data type + ConvFunctionalKey key( + library::Provider::kCUTLASS, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.C.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue); + + // conv operation table for conv2d or conv3d + auto conv_operations = (conv_desc.kind == OperationKind::kConv2d) ? 
+ Singleton::get().operation_table.conv2d_operations : + Singleton::get().operation_table.conv3d_operations; + + // find ConvFunctionalKey in convolution operation table + auto operators_it = conv_operations.find(key); + + if (operators_it == conv_operations.end()) { + return nullptr; + } + + if (operators_it->second.empty()) { + return nullptr; + } + + // conv operation for same compute capability and iterator algorithm + ConvPreferenceKey preference_key( + conv_desc.tile_description.minimum_compute_capability, + conv_desc.iterator_algorithm); + + auto it = operators_it->second.find(preference_key); + + if(it == operators_it->second.end()) { + return nullptr; + } + + // return matching conv opertion (same tile sizes and instruction) + for (auto op : it->second) { + if (op->description().tile_description == operation->description().tile_description) { + return op; + } + } + + return nullptr; +} +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace library } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index 21190cc825..4bbd21c763 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -227,6 +227,23 @@ template <> struct LayoutMap { template <> struct LayoutMap { static LayoutTypeID const kId = LayoutTypeID::kTensorNDHWC; }; + +template <> struct LayoutMap> { + static LayoutTypeID const kId = LayoutTypeID::kTensorNC32HW32; +}; + +template <> struct LayoutMap> { + static LayoutTypeID const kId = LayoutTypeID::kTensorNC64HW64; +}; + +template <> struct LayoutMap> { + static LayoutTypeID const kId = LayoutTypeID::kTensorC32RSK32; +}; + +template <> struct LayoutMap> { + static LayoutTypeID const kId = LayoutTypeID::kTensorC64RSK64; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// template struct OpcodeClassMap; @@ -257,6 +274,43 @@ template <> struct ComplexTransformMap { ///////////////////////////////////////////////////////////////////////////////////////////////// +template struct ConvModeMap; + +template <> struct ConvModeMap { + static ConvModeID const kId = ConvModeID::kCrossCorrelation; +}; + +template <> struct ConvModeMap { + static ConvModeID const kId = ConvModeID::kConvolution; +}; + + +template struct ConvKindMap; + +template <> struct ConvKindMap { + static ConvKind const kId = ConvKind::kFprop; +}; + +template <> struct ConvKindMap { + static ConvKind const kId = ConvKind::kDgrad; +}; + +template <> struct ConvKindMap { + static ConvKind const kId = ConvKind::kWgrad; +}; + + +template struct IteratorAlgorithmMap; + +template <> struct IteratorAlgorithmMap { + static IteratorAlgorithmID const kId = IteratorAlgorithmID::kAnalytic; +}; + +template <> struct IteratorAlgorithmMap { + static IteratorAlgorithmID const kId = IteratorAlgorithmID::kOptimized; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + template TensorDescription make_TensorDescription(int alignment = 1) { TensorDescription desc; diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp index 29d2ef156f..12358dcdd3 100644 --- a/tools/library/src/manifest.cpp +++ b/tools/library/src/manifest.cpp @@ -36,6 +36,11 @@ namespace cutlass { namespace library { 
////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void initialize_reference_operations(Manifest &manifest); + +////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// Top-level initialization Status Manifest::initialize() { @@ -46,6 +51,12 @@ Status Manifest::initialize() { // initialize procedurally generated cutlass op in manifest object initialize_all(*this); + // initialize manually instanced conv3d reference op in manifest object + initialize_reference_operations(*this); + + // initialize manually instanced reduction reference op in manifest object + initialize_all_reduction_op(*this); + return Status::kSuccess; } diff --git a/tools/library/src/operation_table.cu b/tools/library/src/operation_table.cu index 64e4f264cf..482bded851 100644 --- a/tools/library/src/operation_table.cu +++ b/tools/library/src/operation_table.cu @@ -76,6 +76,55 @@ void OperationTable::append(Manifest const &manifest) { } + // insert all conv2d or conv3d operation into operation table + if (desc.kind == OperationKind::kConv2d || desc.kind == OperationKind::kConv3d) { + auto &conv_desc = static_cast(desc); + + ConvFunctionalKey functional_key( + conv_desc.provider, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.C.element, + conv_desc.C.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue + ); + + Operation const *op = operation.get(); + + int cc = conv_desc.tile_description.minimum_compute_capability; + + ConvPreferenceKey preference_key(cc, conv_desc.iterator_algorithm); + + // insert conv operation to conv2d_operations or conv3d_operations map + (desc.kind == OperationKind::kConv2d) ? + conv2d_operations[functional_key][preference_key].push_back(op) : + conv3d_operations[functional_key][preference_key].push_back(op); + } + + // insert all reduction operation into operation table + if (desc.kind == OperationKind::kReduction) { + auto &reduce_desc = static_cast(desc); + + ReductionFunctionalKey functional_key( + reduce_desc.provider, + reduce_desc.element_workspace, + reduce_desc.tile_description.math_instruction.element_accumulator, + reduce_desc.element_output, + reduce_desc.element_epilogue, + library::MathOperationID::kAdd, + library::EpilogueKind::kLinearCombination + ); + + Operation const *op = operation.get(); + + reduction_operations[functional_key] = op; + + } + } } diff --git a/tools/library/src/reduction/init_reduction_operations.cu b/tools/library/src/reduction/init_reduction_operations.cu new file mode 100644 index 0000000000..5f86b64f78 --- /dev/null +++ b/tools/library/src/reduction/init_reduction_operations.cu @@ -0,0 +1,57 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Initialize operations for reduction operation in CUTLASS Library. + +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +namespace cutlass { +namespace library { +/////////////////////////////////////////////////////////////////////////////////////////////// +// CUTLASS Reduction Instances // +/////////////////////////////////////////////////////////////////////////////////////////////// +void initialize_reduce_add_linear_combination_f32_f32_f16(Manifest &manifest); +void initialize_reduce_add_linear_combination_f32_f32_f32(Manifest &manifest); +void initialize_reduce_add_linear_combination_cf32_cf32_cf32(Manifest &manifest); + +// +// Entry point to construct operations +// +void initialize_all_reduction_op(Manifest &manifest) { + + initialize_reduce_add_linear_combination_f32_f32_f16(manifest); + initialize_reduce_add_linear_combination_f32_f32_f32(manifest); + initialize_reduce_add_linear_combination_cf32_cf32_cf32(manifest); + +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass diff --git a/tools/library/src/reduction/reduction_device.cu b/tools/library/src/reduction/reduction_device.cu new file mode 100644 index 0000000000..e2133cc0a5 --- /dev/null +++ b/tools/library/src/reduction/reduction_device.cu @@ -0,0 +1,145 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines operations for reduction operation in CUTLASS Library. +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "reduction_operation.h" + +namespace cutlass { +namespace library { + +// naming convention initialize_reduce_[ReductionOp]_[EpilogueOp]_[ElementWorkspace]_[ElementAccumulator]_[ElementOutput] + +void initialize_reduce_add_linear_combination_f32_f32_f16(Manifest &manifest) { + + using ElementWorkspace = float; + using ElementAccumulator = float; + using ElementOutput = cutlass::half_t; + using ElementCompute = float; + + using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using Operation_reduce_add_linear_combination_f32_f32_f16 = cutlass::reduction::device::ReduceSplitK< + cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + > + >; + + manifest.append(new ReductionOperation< + Operation_reduce_add_linear_combination_f32_f32_f16>( + "reduce_add_linear_combination_f32_f32_f16" + )); +} + + +void initialize_reduce_add_linear_combination_f32_f32_f32(Manifest &manifest) { + + using ElementWorkspace = float; + using ElementAccumulator = float; + using ElementOutput = float; + using ElementCompute = float; + + using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using Operation_reduce_add_linear_combination_f32_f32_f32 = cutlass::reduction::device::ReduceSplitK< + cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + > + >; + + manifest.append(new ReductionOperation< + Operation_reduce_add_linear_combination_f32_f32_f32>( + "reduce_add_linear_combination_f32_f32_f32" + )); +} + +void initialize_reduce_add_linear_combination_cf32_cf32_cf32(Manifest &manifest) { + + using ElementWorkspace = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementOutput = cutlass::complex; + using ElementCompute = cutlass::complex; + + using EpilogueOutputOp = 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using Operation_reduce_add_linear_combination_cf32_cf32_cf32 = cutlass::reduction::device::ReduceSplitK< + cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + > + >; + + manifest.append(new ReductionOperation< + Operation_reduce_add_linear_combination_cf32_cf32_cf32>( + "reduce_add_linear_combination_cf32_cf32_cf32" + )); +} + + +} +} diff --git a/tools/library/src/reduction/reduction_operation.h b/tools/library/src/reduction/reduction_operation.h new file mode 100644 index 0000000000..88572ff684 --- /dev/null +++ b/tools/library/src/reduction/reduction_operation.h @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines operations for reduction operation in CUTLASS Library. 
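+          ReductionOperation wraps a device-level split-K reduction kernel (instantiated in
+          reduction_device.cu) behind the library::Operation interface.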
+*/ + +#pragma once +#include +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/reduction/thread/reduction_operators.h" +#include "cutlass/reduction/device/reduce_split_k.h" + +#include "cutlass/library/library.h" +#include "library_internal.h" +#include "cutlass/core_io.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class ReductionOperation : public Operation { +public: + using Operator = Operator_; + + using ElementWorkspace = typename Operator::ElementWorkspace; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementOutput = typename Operator::ElementOutput; + + using ElementCompute = typename Operator::OutputOp::ElementCompute; + + using OperatorArguments = typename Operator::Arguments; + +protected: + + /// + ReductionDescription description_; + +public: + + /// Constructor + ReductionOperation(char const *name = "unknown_reduction") { + + description_.name = name; + description_.provider = Provider::kCUTLASS; + description_.kind = OperationKind::kReduction; + + description_.tile_description.threadblock_shape = make_Coord(Operator::Shape::kRow, Operator::Shape::kColumn, 1); + + description_.tile_description.math_instruction.instruction_shape = make_Coord(1, 1, 1); + description_.tile_description.math_instruction.element_accumulator = NumericTypeMap::kId; + description_.tile_description.math_instruction.opcode_class = OpcodeClassID::kSimt; + description_.tile_description.math_instruction.math_operation = MathOperationID::kAdd; + + description_.tile_description.minimum_compute_capability = 50; + description_.tile_description.maximum_compute_capability = 1024; + + description_.element_workspace = NumericTypeMap::kId; + description_.element_output = NumericTypeMap::kId; + description_.element_epilogue = NumericTypeMap::kId; + + } + + /// Returns the description of the Reduction operation + virtual OperationDescription const & description() const { + return description_; + } + + +protected: + + /// Constructs the arguments structure given the configuration and arguments + static Status construct_arguments_( + OperatorArguments &operator_args, + ReductionConfiguration const *configuration) { + + operator_args.problem_size = configuration->problem_size; + operator_args.partitions = configuration->partitions; + operator_args.partition_stride = configuration->partition_stride; + + operator_args.workspace = {nullptr, int(configuration->ldw)}; + operator_args.source = {nullptr, int(configuration->lds)}; + operator_args.destination = {nullptr, int(configuration->ldd)}; + + return Status::kSuccess; + } + + /// Constructs the arguments structure given the configuration and arguments + static Status update_arguments_( + OperatorArguments &operator_args, + ReductionArguments const *arguments) { + + if (arguments->pointer_mode == ScalarPointerMode::kHost) { + typename Operator::OutputOp::Params params( + *static_cast(arguments->alpha), + *static_cast(arguments->beta) + ); + operator_args.output = params; + } + else if (arguments->pointer_mode == ScalarPointerMode::kDevice){ + typename Operator::OutputOp::Params params( + static_cast(arguments->alpha), + static_cast(arguments->beta) + ); + operator_args.output = params; + } + else { + return Status::kErrorInvalidProblem; + } + + 
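+    // Bind the caller-supplied workspace, source, and destination pointers to the operator
+    // arguments (they arrive as pointers-to-const, hence the const_cast below).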
operator_args.workspace.reset(static_cast(const_cast(arguments->workspace))); + operator_args.source.reset(static_cast(const_cast(arguments->source))); + operator_args.destination.reset(static_cast(const_cast(arguments->destination))); + + return Status::kSuccess; + } + +public: + + /// Returns success if the operation can proceed + virtual Status can_implement( + void const *configuration_ptr, + void const *arguments_ptr) const { + + ReductionConfiguration const *configuration = + static_cast(configuration_ptr); + + ReductionArguments const *arguments = + static_cast(arguments_ptr); + + OperatorArguments args; + + Status status = construct_arguments_(args, configuration); + + if (status != Status::kSuccess) { + return status; + } + + status = update_arguments_(args, arguments); + + if (status != Status::kSuccess) { + return status; + } + + return Operator::can_implement(args); + } + + /// Gets the host-side workspace + virtual uint64_t get_host_workspace_size( + void const *configuration) const { + + return sizeof(Operator); + } + + /// Gets the device-side workspace + virtual uint64_t get_device_workspace_size( + void const *configuration_ptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return 0; + } + + return Operator::get_workspace_size(args); + } + + /// Initializes the workspace + virtual Status initialize( + void const *configuration_ptr, + void *host_workspace, + void *device_workspace, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = new (host_workspace) Operator; + //std::cout << "initialize library::Reduction" << std::endl; + //print_operator_args(args); + return op->initialize(args, device_workspace, stream); + } + + /// Runs the kernel + virtual Status run( + void const *arguments_ptr, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = update_arguments_( + args, + static_cast(arguments_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = static_cast(host_workspace); + + status = op->update(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + + //std::cout << "run library::Reduction" << std::endl; + //print_operator_args(args); + return op->run(stream); + } + + /// Call print_operator_args from the Reduction::initialize() + // to dump arguments passed on to cutlass operator for debugging + void print_operator_args(OperatorArguments &operator_args) const { + std::cout << "Reduction::OperatorArguments" << std::endl + << " problem_size: " + << operator_args.problem_size << std::endl + << " partitions: " + << operator_args.partitions << std::endl + << " partition_stride: " + << operator_args.partition_stride << std::endl + << " epilouge (alpha, beta): " + << operator_args.output.alpha << ", " + << operator_args.output.beta << std::endl + << " workspace (ptr, stride): " + << operator_args.workspace.data() << ", " + << operator_args.workspace.stride(0) << std::endl + << " source (ptr, stride): " + << operator_args.source.data() << ", " + << operator_args.source.stride(0) << std::endl + << " destination (ptr, stride): " + << operator_args.destination.data() << ", " + << operator_args.destination.stride(0) << std::endl; + } +}; + 
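Conv2dOperation, Conv3dOperation, and ReductionOperation above all expose the same stateless
library::Operation protocol: check can_implement(), query the host- and device-side workspace
sizes, initialize() the host workspace (which placement-news the underlying Operator), then call
run() with per-launch arguments. The sketch below shows one way a caller might drive any of these
operations end to end; run_operation, config, and args are illustrative names only (the concrete
configuration/arguments structs depend on the operation kind), and CUDA allocation error handling
is elided.

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"

// Drives a library::Operation through can_implement / initialize / run.
// 'config' and 'args' must point to the configuration and arguments structs that
// match the concrete operation (e.g. Conv2dConfiguration / ConvArguments).
cutlass::Status run_operation(
  cutlass::library::Operation const *op,
  void const *config,
  void const *args,
  cudaStream_t stream = nullptr) {

  using cutlass::Status;

  // Reject unsupported problems before allocating any workspace.
  Status status = op->can_implement(config, args);
  if (status != Status::kSuccess) {
    return status;
  }

  // The host workspace holds the underlying CUTLASS Operator object.
  std::vector<uint8_t> host_workspace(op->get_host_workspace_size(config));

  // The device workspace holds scratch such as split-K partials, if the operation needs any.
  void *device_workspace = nullptr;
  uint64_t device_bytes = op->get_device_workspace_size(config);
  if (device_bytes) {
    cudaMalloc(&device_workspace, size_t(device_bytes));  // allocation errors elided in this sketch
  }

  status = op->initialize(config, host_workspace.data(), device_workspace, stream);

  if (status == Status::kSuccess) {
    status = op->run(args, host_workspace.data(), device_workspace, stream);
  }

  if (device_workspace) {
    cudaFree(device_workspace);
  }

  return status;
}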
+ +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/src/reference/conv2d.cu b/tools/library/src/reference/conv2d.cu new file mode 100644 index 0000000000..750ebdf31c --- /dev/null +++ b/tools/library/src/reference/conv2d.cu @@ -0,0 +1,223 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief + +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "conv_reference_operation.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +void initialize_conv2d_reference_operations(Manifest &manifest) { + + make_conv_all< + 2, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::half_t + >(manifest); + + make_conv_all< + 2, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + float, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::complex, cutlass::layout::TensorNHWC, + cutlass::complex, cutlass::layout::TensorNHWC, + cutlass::complex, cutlass::layout::TensorNHWC, + cutlass::complex, + cutlass::complex + >(manifest); + + make_conv_fprop< + 2, + int8_t, cutlass::layout::TensorNHWC, + int8_t, cutlass::layout::TensorNHWC, + int32_t, cutlass::layout::TensorNHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + int8_t, cutlass::layout::TensorNHWC, + int8_t, cutlass::layout::TensorNHWC, + int8_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + uint8_t, cutlass::layout::TensorNHWC, + uint8_t, cutlass::layout::TensorNHWC, + uint8_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + uint8_t, cutlass::layout::TensorNHWC, + uint8_t, cutlass::layout::TensorNHWC, + int32_t, cutlass::layout::TensorNHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + uint8_t, cutlass::layout::TensorNHWC, + uint8_t, cutlass::layout::TensorNHWC, + int8_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + cutlass::int4b_t, cutlass::layout::TensorNHWC, + cutlass::int4b_t, 
cutlass::layout::TensorNHWC, + int32_t, cutlass::layout::TensorNHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + cutlass::int4b_t, cutlass::layout::TensorNHWC, + cutlass::int4b_t, cutlass::layout::TensorNHWC, + cutlass::int4b_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + int32_t, cutlass::layout::TensorNHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/library/src/reference/conv3d.cu b/tools/library/src/reference/conv3d.cu new file mode 100644 index 0000000000..1e1544bff6 --- /dev/null +++ b/tools/library/src/reference/conv3d.cu @@ -0,0 +1,203 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "conv_reference_operation.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +void initialize_conv3d_reference_operations(Manifest &manifest) { + + make_conv_all< + 3, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, + cutlass::half_t + >(manifest); + + make_conv_all< + 3, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + float, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_fprop< + 3, + int8_t, cutlass::layout::TensorNDHWC, + int8_t, cutlass::layout::TensorNDHWC, + int32_t, cutlass::layout::TensorNDHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + int8_t, cutlass::layout::TensorNDHWC, + int8_t, cutlass::layout::TensorNDHWC, + int8_t, cutlass::layout::TensorNDHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + uint8_t, cutlass::layout::TensorNDHWC, + uint8_t, cutlass::layout::TensorNDHWC, + int32_t, cutlass::layout::TensorNDHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + uint8_t, cutlass::layout::TensorNDHWC, + uint8_t, cutlass::layout::TensorNDHWC, + int8_t, cutlass::layout::TensorNDHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + int32_t, cutlass::layout::TensorNDHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + 
cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + int32_t, cutlass::layout::TensorNDHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/library/src/reference/conv_reference_operation.h b/tools/library/src/reference/conv_reference_operation.h new file mode 100644 index 0000000000..1e826ab29e --- /dev/null +++ b/tools/library/src/reference/conv_reference_operation.h @@ -0,0 +1,607 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief Defines operations for all CONV operation kinds in CUTLASS Library +*/ + +#pragma once + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" +#include "cutlass/library/util.h" +#include "library_internal.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + Provider kProvider, + conv::Operator ConvolutionalOperator, + int ConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +struct ConvReferenceDispatcher; + +/// Dispatcher for Conv2d (partially specialied for kConvDim == 2) +template < + Provider kProvider, + conv::Operator kConvolutionalOperator, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator, + typename ConvertOp, + typename InnerProductOp +> +struct ConvReferenceDispatcher< + kProvider, + kConvolutionalOperator, + 2, + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp> { + + static Status dispatch( + void const *configuration, + ElementA *ptr_A, + ElementB *ptr_B, + ElementC *ptr_C, + ElementC *ptr_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr + ) { + + Conv2dConfiguration const &config = + *static_cast(configuration); + + ConvKind const conv_kind = ConvKindMap::kId; + + if (kProvider == Provider::kReferenceHost) { + + cutlass::reference::host::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC , + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >( + kConvolutionalOperator, + config.problem_size, + {ptr_A, config.layout_a(conv_kind)}, + {ptr_B, config.layout_b(conv_kind)}, + {ptr_C, config.layout_c(conv_kind)}, + {ptr_D, config.layout_c(conv_kind)}, + alpha, + beta + ); + + return Status::kSuccess; + } + else if (kProvider == Provider::kReferenceDevice) { + return cutlass::reference::device::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >( + kConvolutionalOperator, + config.problem_size, + {ptr_A, config.layout_a(conv_kind)}, + {ptr_B, config.layout_b(conv_kind)}, + {ptr_C, config.layout_c(conv_kind)}, + {ptr_D, config.layout_c(conv_kind)}, + alpha, + beta, + stream + ); + } + return Status::kErrorNotSupported; + } +}; + +/// Dispatcher for Conv3d (partially specialized for kConvDim == 3) +template < + Provider kProvider, + conv::Operator kConvolutionalOperator, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator, + typename ConvertOp, + typename InnerProductOp +> 
+struct ConvReferenceDispatcher< + kProvider, + kConvolutionalOperator, + 3, + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp> { + + static Status dispatch( + void const *configuration, + ElementA *ptr_A, + ElementB *ptr_B, + ElementC *ptr_C, + ElementC *ptr_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr + ) { + + Conv3dConfiguration const &config = + *static_cast(configuration); + + ConvKind const conv_kind = ConvKindMap::kId; + + if (kProvider == Provider::kReferenceHost) { + cutlass::reference::host::Conv3d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC , + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >( + kConvolutionalOperator, + config.problem_size, + {ptr_A, config.layout_a(conv_kind)}, + {ptr_B, config.layout_b(conv_kind)}, + {ptr_C, config.layout_c(conv_kind)}, + {ptr_D, config.layout_c(conv_kind)}, + alpha, + beta + ); + + return Status::kSuccess; + } + else if (kProvider == Provider::kReferenceDevice) { + return cutlass::reference::device::Conv3d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >( + kConvolutionalOperator, + config.problem_size, + {ptr_A, config.layout_a(conv_kind)}, + {ptr_B, config.layout_b(conv_kind)}, + {ptr_C, config.layout_c(conv_kind)}, + {ptr_D, config.layout_c(conv_kind)}, + alpha, + beta, + stream + ); + } + return Status::kErrorNotSupported; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + Provider Provider_, + conv::Operator ConvolutionalOperator, + int ConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +class ConvReferenceOperation : public Operation { +public: + static Provider const kProvider = Provider_; + static conv::Operator const kConvolutionalOperator = ConvolutionalOperator; + static int const kConvDim = ConvDim; + + using ElementA = ElementA_; + using LayoutA = LayoutA_; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using ElementCompute = ElementCompute_; + using ElementAccumulator = ElementAccumulator_; + using ConvertOp = ConvertOp_; + using InnerProductOp = InnerProductOp_; + +protected: + + /// Storage for the name string + std::string name_; + + /// + ConvDescription description_; + +public: + + /// Constructor + ConvReferenceOperation() { + + // Basic information + description_.provider = kProvider; + description_.kind = (kConvDim == 2 ? 
OperationKind::kConv2d : OperationKind::kConv3d); + description_.conv_kind = ConvKindMap::kId; + description_.conv_dim = kConvDim; + + // Tensor description + description_.A = make_TensorDescription(); + description_.B = make_TensorDescription(); + description_.C = make_TensorDescription(); + + // Epilogue compute and accumulator type description + description_.element_epilogue = NumericTypeMap::kId; + + description_.tile_description.math_instruction.element_accumulator = + NumericTypeMap::kId; + + // Iterator algorithm for convolution reference + description_.iterator_algorithm = IteratorAlgorithmID::kNone; + + // Compute capability for convolution reference + description_.tile_description.minimum_compute_capability = + (kProvider == Provider::kReferenceDevice ? 50 : 0); + + description_.tile_description.maximum_compute_capability = 1024; + + // Procedural name + std::stringstream ss; + + ss << "conv" << kConvDim << "d_" << to_string(description_.conv_kind) + << "_reference_" << to_string(description_.provider) + << "_" << to_string(description_.A.element) << to_string(description_.A.layout) + << "_" << to_string(description_.B.element) << to_string(description_.B.layout) + << "_" << to_string(description_.C.element) << to_string(description_.C.layout) + << "_" << to_string(description_.tile_description.math_instruction.element_accumulator); + + name_ = ss.str(); + + description_.name = name_.c_str(); + + // Epilogue compute and accumulator type description + description_.element_epilogue = NumericTypeMap::kId; + + description_.tile_description.math_instruction.element_accumulator = + NumericTypeMap::kId; + } + + /// Returns the description of the GEMM operation + virtual OperationDescription const & description() const { + return description_; + } + + virtual Status can_implement( + void const *configuration, + void const *arguments) const { + + return Status::kSuccess; + } + + virtual uint64_t get_host_workspace_size( + void const *configuration) const { + + switch (kConvDim) { + case 2: + return sizeof(Conv2dConfiguration); + case 3: + return sizeof(Conv3dConfiguration); + default: + break; + } + + return 0; + } + + virtual uint64_t get_device_workspace_size( + void const *configuration) const { + + return 0; + } + + virtual Status initialize( + void const *configuration, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + std::memcpy(host_workspace, configuration, get_host_workspace_size(configuration)); + + return Status::kSuccess; + } + + virtual Status run( + void const *arguments, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + ConvArguments const &args = *static_cast(arguments); + + ElementCompute alpha; + ElementCompute beta; + + alpha = *static_cast(args.alpha); + beta = *static_cast(args.beta); + + // TODO - respect pointer mode + + // Invoke 2D or 3D convolution + return detail::ConvReferenceDispatcher< + kProvider, + kConvolutionalOperator, + kConvDim, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >::dispatch( + host_workspace, + static_cast(const_cast(args.A)), + static_cast(const_cast(args.B)), + static_cast(const_cast(args.C)), + static_cast(args.D), + alpha, + beta, + stream + ); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Constructs Fprop reference operators. 
+template < + int kConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +void make_conv_fprop(Manifest &manifest) { + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceHost, + conv::Operator::kFprop, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceDevice, + conv::Operator::kFprop, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); +} + +/// Constructs Dgrad and Wgrad reference operators. +template < + int kConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +void make_conv_backwards(Manifest &manifest) { + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceHost, + conv::Operator::kDgrad, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceDevice, + conv::Operator::kDgrad, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceHost, + conv::Operator::kWgrad, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceDevice, + conv::Operator::kWgrad, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); +} + +/// Six operators for the price of one. 
+template < + int kConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +void make_conv_all(Manifest &manifest) { + + make_conv_fprop< + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >(manifest); + + make_conv_backwards< + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >(manifest); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/library/src/reference/initialize_reference_operations.cu b/tools/library/src/reference/initialize_reference_operations.cu index 016d91a6f2..c749c2bca9 100644 --- a/tools/library/src/reference/initialize_reference_operations.cu +++ b/tools/library/src/reference/initialize_reference_operations.cu @@ -37,10 +37,14 @@ namespace cutlass { namespace library { void initialize_gemm_reference_operations(Manifest &manifest); +void initialize_conv2d_reference_operations(Manifest &manifest); +void initialize_conv3d_reference_operations(Manifest &manifest); /////////////////////////////////////////////////////////////////////////////////////////////////// void initialize_reference_operations(Manifest &manifest) { + initialize_conv2d_reference_operations(manifest); + initialize_conv3d_reference_operations(manifest); initialize_gemm_reference_operations(manifest); } diff --git a/tools/library/src/util.cu b/tools/library/src/util.cu index 13fb9dfc0a..b20f505425 100644 --- a/tools/library/src/util.cu +++ b/tools/library/src/util.cu @@ -50,6 +50,7 @@ Provider_enumerants[] = { {"host", "reference_host", Provider::kReferenceHost}, {"device", "reference_device", Provider::kReferenceDevice}, {"cublas", "cuBLAS", Provider::kCUBLAS}, + {"cudnn", "cuDNN", Provider::kCUDNN}, }; /// Converts a Provider enumerant to a string @@ -128,6 +129,9 @@ static struct { OperationKind_enumerants[] = { {"eq_gemm", "EqGemm", OperationKind::kEqGemm}, {"gemm", "Gemm", OperationKind::kGemm}, + {"conv2d", "Conv2d", OperationKind::kConv2d}, + {"conv3d", "Conv3d", OperationKind::kConv3d}, + {"spgemm", "SparseGemm", OperationKind::kSparseGemm}, }; /// Converts a Status enumerant to a string @@ -445,6 +449,10 @@ layout_aliases[] = { {LayoutTypeID::kTensorNCDHW, "ncdhw"}, {LayoutTypeID::kTensorNHWC, "nhwc"}, {LayoutTypeID::kTensorNDHWC, "ndhwc"}, + {LayoutTypeID::kTensorNC32HW32, "nc32hw32"}, + {LayoutTypeID::kTensorNC64HW64, "nc64hw64"}, + {LayoutTypeID::kTensorC32RSK32, "c32rsk32"}, + {LayoutTypeID::kTensorC64RSK64, "c64rsk64"}, {LayoutTypeID::kUnknown, "*"}, {LayoutTypeID::kInvalid, nullptr} @@ -474,22 +482,46 @@ LayoutTypeID from_string(std::string const &str) { /// Gets stride rank for the layout_id (static function) int get_layout_stride_rank(LayoutTypeID layout_id) { switch (layout_id) { - case LayoutTypeID::kColumnMajor: return cutlass::layout::ColumnMajor::kStrideRank; - case LayoutTypeID::kRowMajor: return cutlass::layout::RowMajor::kStrideRank; + case LayoutTypeID::kColumnMajor: + return 
cutlass::layout::ColumnMajor::kStrideRank; + case LayoutTypeID::kRowMajor: + return cutlass::layout::RowMajor::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK2: + return cutlass::layout::ColumnMajorInterleaved<2>::kStrideRank; case LayoutTypeID::kRowMajorInterleavedK2: + return cutlass::layout::RowMajorInterleaved<2>::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK4: + return cutlass::layout::ColumnMajorInterleaved<4>::kStrideRank; case LayoutTypeID::kRowMajorInterleavedK4: + return cutlass::layout::RowMajorInterleaved<4>::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK16: + return cutlass::layout::ColumnMajorInterleaved<16>::kStrideRank; case LayoutTypeID::kRowMajorInterleavedK16: + return cutlass::layout::RowMajorInterleaved<16>::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK32: + return cutlass::layout::ColumnMajorInterleaved<32>::kStrideRank; case LayoutTypeID::kRowMajorInterleavedK32: + return cutlass::layout::RowMajorInterleaved<32>::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK64: - case LayoutTypeID::kRowMajorInterleavedK64: return 1; + return cutlass::layout::ColumnMajorInterleaved<64>::kStrideRank; + case LayoutTypeID::kRowMajorInterleavedK64: + return cutlass::layout::RowMajorInterleaved<64>::kStrideRank; case LayoutTypeID::kTensorNCHW: - case LayoutTypeID::kTensorNHWC: return 3; - case LayoutTypeID::kTensorNDHWC: return 4; - default : throw std::runtime_error("Unsupported LayoutTypeID in LayoutType::get_stride_rank"); + return cutlass::layout::TensorNCHW::kStrideRank; + case LayoutTypeID::kTensorNHWC: + return cutlass::layout::TensorNHWC::kStrideRank; + case LayoutTypeID::kTensorNDHWC: + return cutlass::layout::TensorNDHWC::kStrideRank; + case LayoutTypeID::kTensorNC32HW32: + return cutlass::layout::TensorNCxHWx<32>::kStrideRank; + case LayoutTypeID::kTensorNC64HW64: + return cutlass::layout::TensorNCxHWx<64>::kStrideRank; + case LayoutTypeID::kTensorC32RSK32: + return cutlass::layout::TensorCxRSKx<32>::kStrideRank; + case LayoutTypeID::kTensorC64RSK64: + return cutlass::layout::TensorCxRSKx<64>::kStrideRank; + default: + throw std::runtime_error("Unsupported LayoutTypeID in LayoutType::get_stride_rank"); } } @@ -624,6 +656,136 @@ SplitKMode from_string(std::string const &str) { } ///////////////////////////////////////////////////////////////////////////////////////////////// +static struct { + char const *text; + char const *pretty; + ConvModeID enumerant; +} +ConvModeID_enumerants[] = { + {"cross", "", ConvModeID::kCrossCorrelation}, + {"conv", "", ConvModeID::kConvolution}, +}; + +/// Converts a ConvModeID enumerant to a string +char const *to_string(ConvModeID type, bool pretty) { + + for (auto const & possible : ConvModeID_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? 
"Invalid" : "invalid"; +} + +/// Converts a ConvModeID enumerant from a string +template <> +ConvModeID from_string(std::string const &str) { + + for (auto const & possible : ConvModeID_enumerants) { + if ((str.compare(possible.text) == 0) || + (str.compare(possible.pretty) == 0)) { + return possible.enumerant; + } + } + + return ConvModeID::kInvalid; +} + + +static struct { + char const *text; + char const *pretty; + IteratorAlgorithmID enumerant; +} +IteratorAlgorithmID_enumerants[] = { + {"none", "", IteratorAlgorithmID::kNone}, + {"analytic", "", IteratorAlgorithmID::kAnalytic}, + {"optimized", "", IteratorAlgorithmID::kOptimized}, +}; + +/// Converts a ConvModeID enumerant to a string +char const *to_string(IteratorAlgorithmID type, bool pretty) { + + for (auto const & possible : IteratorAlgorithmID_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? "Invalid" : "invalid"; +} + +/// Converts a ConvModeID enumerant from a string +template <> +IteratorAlgorithmID from_string(std::string const &str) { + + for (auto const & possible : IteratorAlgorithmID_enumerants) { + if ((str.compare(possible.text) == 0) || + (str.compare(possible.pretty) == 0)) { + return possible.enumerant; + } + } + + return IteratorAlgorithmID::kInvalid; +} +/////////////////////////////////////////////////////////////////////////////////////////////////// + +static struct { + char const *text; + char const *pretty; + ConvKind enumerant; +} +ConvKind_enumerants[] = { + {"unknown", "", ConvKind::kUnknown}, + {"fprop", "", ConvKind::kFprop}, + {"dgrad", "", ConvKind::kDgrad}, + {"wgrad", "", ConvKind::kWgrad}, +}; + +/// Converts a ConvKind enumerant to a string +char const *to_string(ConvKind type, bool pretty) { + + for (auto const & possible : ConvKind_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? "Invalid" : "invalid"; +} + + +/// Converts a ConvKind enumerant from a string +template <> +ConvKind from_string(std::string const &str) { + + for (auto const & possible : ConvKind_enumerants) { + if ((str.compare(possible.text) == 0) || + (str.compare(possible.pretty) == 0)) { + return possible.enumerant; + } + } + + return ConvKind::kInvalid; +} +/////////////////////////////////////////////////////////////////////////////////////////////////// + /// Lexical cast a string to a byte array. Returns true if cast is successful or false if invalid. 
bool lexical_cast(std::vector &bytes, NumericTypeID type, std::string const &str) { int size_bytes = sizeof_bits(type) / 8; @@ -1224,5 +1386,3 @@ bool cast_from_double(std::vector &bytes, NumericTypeID type, double sr } // namespace cutlass /////////////////////////////////////////////////////////////////////////////////////////////////// - - diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt index 52baacb1aa..3ac944a9f2 100644 --- a/tools/profiler/CMakeLists.txt +++ b/tools/profiler/CMakeLists.txt @@ -34,9 +34,12 @@ set(CUTLASS_TOOLS_PROFILER_SOURCES src/device_allocation.cu src/device_context.cu src/cublas_helpers.cpp + src/cudnn_helpers.cpp src/problem_space.cpp src/operation_profiler.cu src/gemm_operation_profiler.cu + src/conv2d_operation_profiler.cu + src/conv3d_operation_profiler.cu src/sparse_gemm_operation_profiler.cu ) @@ -58,7 +61,7 @@ set_target_properties(cutlass_profiler PROPERTIES EXPORT_NAME profiler) target_include_directories( cutlass_profiler PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/src # Source directory + ${CMAKE_CURRENT_LIST_DIR}/src ) # @@ -71,6 +74,7 @@ target_link_libraries( cutlass_lib cutlass_tools_util_includes $<$:nvidia::cublas> + $<$:nvidia::cudnn> cudart ) @@ -79,3 +83,16 @@ install( EXPORT NvidiaCutlass RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) + +set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM --operation=Gemm --providers=cutlass --verification-providers=cublas,device --junit-output=test_cutlass_profiler_gemm) +set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D --operation=Conv2d --providers=cutlass --verification-providers=cudnn,device --junit-output=test_cutlass_profiler_conv2d) +set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D --operation=Conv3d --providers=cutlass --verification-providers=cudnn,device,host --junit-output=test_cutlass_profiler_conv3d) +cutlass_add_executable_tests( + test_profiler cutlass_profiler + DEPENDEES test_all + TEST_COMMAND_OPTIONS + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D + DISABLE_EXECUTABLE_INSTALL_RULE + ) diff --git a/tools/profiler/src/conv2d_operation_profiler.cu b/tools/profiler/src/conv2d_operation_profiler.cu new file mode 100644 index 0000000000..4b91535719 --- /dev/null +++ b/tools/profiler/src/conv2d_operation_profiler.cu @@ -0,0 +1,1468 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Convolution 2D profiling +*/ + +#include +#include +#include +#include + +#include "cutlass/core_io.h" + +#include "conv2d_operation_profiler.h" +#include "gpu_timer.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// +using namespace cutlass::library; + +namespace cutlass { +namespace profiler { + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Ctor +Conv2dOperationProfiler::Conv2dOperationProfiler(Options const &options): + OperationProfiler( + options, + library::OperationKind::kConv2d, + { + {ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"}, + {ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"h", "input_h"}, "Input H dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"w", "input_w"}, "Input W dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"c", "input_c"}, "Input C dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"k", "filter_k"}, "Filter K dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"r", "filter_r"}, "Filter R dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"s", "filter_s"}, "Filter S dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"p", "output_p"}, "Output P dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"q", "output_q"}, "Output Q dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"pad_h"}, "Padding in H direction"}, + {ArgumentTypeID::kInteger, {"pad_w"}, "Padding in W direction"}, + {ArgumentTypeID::kInteger, {"stride_h"}, "Stride in H direction"}, + {ArgumentTypeID::kInteger, {"stride_w"}, "Stride in W direction"}, + {ArgumentTypeID::kInteger, {"dilation_h"}, "Dilation in H direction"}, + {ArgumentTypeID::kInteger, {"dilation_w"}, "Dilation in W direction"}, + {ArgumentTypeID::kTensor, {"Activation"}, "Tensor storing the Activation operand"}, + {ArgumentTypeID::kTensor, {"Filter"}, "Tensor storing the Filter operand"}, + {ArgumentTypeID::kTensor, {"Output"}, "Tensor storing the Output operand"}, + {ArgumentTypeID::kEnumerated, {"conv_mode"}, "Convolution filter mode (conv, cross)"}, + {ArgumentTypeID::kEnumerated, {"iterator_algorithm", "iterator_algo"}, "Convolution iterator algorithm (analytic, optimized)"}, + {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, + {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, + {ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "SplitK mode for serial or parallel reduction (serial, parallel)"}, + {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"}, + {ArgumentTypeID::kEnumerated, 
{"eq_gemm_provider", "eq-gemm-provider"}, "Enable profiling equivalent gemm by the following providers (cutlass)"}, + }, + { library::Provider::kReferenceDevice, library::Provider::kReferenceHost, library::Provider::kCUDNN } + ) { + + description_ = " Conv2d operation. Output(Tensor4D) = alpha * Input(Tensor4D) * Filter(Tensor4D) + beta * Input(Tensor4D)"; + +} + +/// Destructor +Conv2dOperationProfiler::~Conv2dOperationProfiler() { + +} + + +/// Prints usage statement for the math function +void Conv2dOperationProfiler::print_usage(std::ostream &out) const { + out << "Conv2d" << "\n\n"; + + OperationProfiler::print_usage(out); +} + +/// Prints examples +void Conv2dOperationProfiler::print_examples(std::ostream &out) const { + + out << "\nExamples:\n\n" + << "Profile a particular convolution (specify all the convolution parameters):\n" + << " $ cutlass_profiler --operation=Conv2d" + " --Activation=f16:nhwc --Filter=f16:nhwc --Output=f16 --accumulator-type=f32" + " --n=32 --h=14 --w=14 --c=8 --k=64 --r=3 --s=3" + " --pad_h=1 --pad_w=1" + " --stride::h=1 --stride::w=1" + " --dilation::h=1 --dilation::w=1\n\n"; +} + +#if 0 +// used this for debugging +static std::string byte_string(std::vector const &bytes) { + std::stringstream ss; + + ss << "0x"; + + for (size_t idx = bytes.size(); idx > 0; --idx) { + ss << std::hex << std::setw(2) << std::setfill('0') << uint32_t(bytes.at(idx - 1)); + } + + return ss.str(); +} +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Total number of bytes loaded +int64_t Conv2dOperationProfiler::Conv2dProblem::bytes( + library::ConvDescription const &operation_desc) const { + + cutlass::gemm::GemmCoord mnk = eq_gemm_size(operation_desc.conv_kind); + + // Input bytes read and Output bytes written for the gemm problem + int64_t bytes_ = + int64_t(library::sizeof_bits(operation_desc.A.element) * mnk.m() / 8) * mnk.k() + + int64_t(library::sizeof_bits(operation_desc.B.element) * mnk.n() / 8) * mnk.k() + + int64_t(library::sizeof_bits(operation_desc.C.element) * mnk.m() / 8) * mnk.n(); + + // Set is_beta_zero true if beta is zero + bool is_beta_zero = std::all_of(beta.begin(), beta.end(), [](uint8_t i) { return i==0; }); + + // Output bytes read for the gemm problem for non-zero beta values + if (!is_beta_zero) { + bytes_ += int64_t(library::sizeof_bits(operation_desc.C.element) * mnk.m() / 8) * mnk.n(); + } + + return bytes_; +} + +/// Total number of flops computed +int64_t Conv2dOperationProfiler::Conv2dProblem::flops( + library::ConvDescription const &operation_desc) const { + + cutlass::gemm::GemmCoord mnk = eq_gemm_size(operation_desc.conv_kind); + + int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2; + int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2; + + // Adjust mainloop flop for dgrad strided + if (operation_desc.conv_kind == library::ConvKind::kDgrad) { + flops_mainloop_ = flops_mainloop_ / (stride_h * stride_w); + } + int64_t flops_total_ = flops_mainloop_ + flops_epilogue_; + + //complex-valued support + switch (operation_desc.tile_description.math_instruction.math_operation) { + case library::MathOperationID::kMultiplyAddComplex: + flops_total_ *=4; + break; + + default: break; + } + + return flops_total_; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Extracts the problem dimensions +Status Conv2dOperationProfiler::initialize_configuration( + Options const &options, + PerformanceReport 
&report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + library::ConvDescription const &operation_desc = + static_cast(operation->description()); + + if (!arg_as_int(problem_.n, "n", problem_space, problem)) { + // default value + problem_.n = 1; + } + + if (!arg_as_int(problem_.h, "h", problem_space, problem)) { + // default value + problem_.h = 16; + } + + if (!arg_as_int(problem_.w, "w", problem_space, problem)) { + // default value + problem_.w = 16; + } + + if (!arg_as_int(problem_.c, "c", problem_space, problem)) { + // default value + problem_.c = 64; + } + + if (!arg_as_int(problem_.k, "k", problem_space, problem)) { + // default value + problem_.k = 64; + } + + if (!arg_as_int(problem_.r, "r", problem_space, problem)) { + // default value + problem_.r = 3; + } + + if (!arg_as_int(problem_.s, "s", problem_space, problem)) { + // default value + problem_.s = 3; + } + + if (!arg_as_int(problem_.pad_h, "pad_h", problem_space, problem)) { + // default value + problem_.pad_h = 1; + } + + if (!arg_as_int(problem_.pad_w, "pad_w", problem_space, problem)) { + // default value + problem_.pad_w = 1; + } + + if (!arg_as_int(problem_.stride_h, "stride_h", problem_space, problem)) { + // default value + problem_.stride_h = 1; + } + + if (!arg_as_int(problem_.stride_w, "stride_w", problem_space, problem)) { + // default value + problem_.stride_w = 1; + } + + if (!arg_as_int(problem_.dilation_h, "dilation_h", problem_space, problem)) { + // default value + problem_.dilation_h = 1; + } + + if (!arg_as_int(problem_.dilation_w, "dilation_w", problem_space, problem)) { + // default value + problem_.dilation_w = 1; + } + + //////////////////////// Convolution output dimensions p and q //////////////////////// + // Cutlass convolutions support arbitrary output sizes and not constriant by // + // input, filter, padding, striding, dilation sizes. // + // cuDNN sets the output dimensions (p, q) using following equations: // + // // + // output = div_up(input + 2 * pad - ((filter - 1) * dilation + 1) + 1, stride) // + // where; div_up(a, b) : (a - 1)/b + 1 // + // // + // Thus, when output p and q dimensions are unspecified by the user // + // cutlass profiler sets p and q which are cuDNN compliant. 
// + // // + //////////////////////////////////////////////////////////////////////////////////////// + // set convolution output p + if (!arg_as_int(problem_.p, "p", problem_space, problem)) { + // default value (set using cudnn formula for output height, when p is not provided) + problem_.p = ( + problem_.h + + 2 * problem_.pad_h - + ((problem_.r - 1) * problem_.dilation_h + 1) + ) / (problem_.stride_h) + + 1; + } + + // set convolution output q + if (!arg_as_int(problem_.q, "q", problem_space, problem)) { + // default value (set using cudnn formula for output width, when q is not provided) + problem_.q = ( + problem_.w + + 2 * problem_.pad_w - + ((problem_.s - 1) * problem_.dilation_w + 1) + ) / (problem_.stride_w) + + 1; + } + ///////////////////////////////////////////////////////////////////////////////////////// + + + if (!arg_as_SplitKModeID(problem_.split_k_mode, "split_k_mode", problem_space, problem)) { + // default value + problem_.split_k_mode = library::SplitKMode::kSerial; + } + + if (!arg_as_int(problem_.split_k_slices, "split_k_slices", problem_space, problem)) { + // default value + problem_.split_k_slices = 1; + } + + if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) { + // default value + problem_.conv_mode = library::ConvModeID::kCrossCorrelation; + } + + if (!arg_as_ProviderID(problem_.eq_gemm_provider, "eq_gemm_provider", problem_space, problem)) { + // default value + problem_.eq_gemm_provider = library::Provider::kNone; + } + + if (!conv_kind_satisfies(operation_desc.conv_kind, "conv_kind", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!iterator_algorithm_satisfies(operation_desc.iterator_algorithm, "iterator_algorithm", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.activation(), "Activation", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.filter(), "Filter", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.output(), "Output", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!arg_as_scalar( + problem_.alpha, + operation_desc.element_epilogue, + "alpha", + problem_space, + problem)) { + + if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) { + return Status::kErrorInternal; + } + } + + if (!arg_as_scalar( + problem_.beta, + operation_desc.element_epilogue, + "beta", + problem_space, + problem)) { + + if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) { + return Status::kErrorInternal; + } + } + + // initialize library::Conv2dConfiguration + conv_workspace_.configuration.problem_size = conv::Conv2dProblemSize( + int(problem_.n), + int(problem_.h), + int(problem_.w), + int(problem_.c), + int(problem_.k), + int(problem_.r), + int(problem_.s), + int(problem_.p), + int(problem_.q), + int(problem_.pad_h), + int(problem_.pad_w), + int(problem_.stride_h), + int(problem_.stride_w), + int(problem_.dilation_h), + int(problem_.dilation_w), + static_cast(static_cast(problem_.conv_mode)), + int(problem_.split_k_slices), + 1 // groups + ); + + conv_workspace_.configuration.split_k_mode = static_cast(static_cast(problem_.split_k_mode)); + + conv_workspace_.configuration.layout_activations.stride() = make_Coord( + int(problem_.c), + int(problem_.w) * int(problem_.c), + int(problem_.h) * int(problem_.w) * int(problem_.c) + ); 
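+
+  // Worked example (illustrative) using the default problem above
+  // (n=1, h=w=16, c=64, k=64, r=s=3, pad=1, stride=1, dilation=1) and the
+  // cuDNN output-size formula documented earlier:
+  //   p = (16 + 2*1 - ((3 - 1)*1 + 1)) / 1 + 1 = 16
+  //   q = (16 + 2*1 - ((3 - 1)*1 + 1)) / 1 + 1 = 16
+  // so the packed NHWC activation strides set above evaluate to
+  //   {c, w*c, h*w*c} = {64, 1024, 16384}.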
+ + conv_workspace_.configuration.layout_filters.stride() = make_Coord( + int(problem_.c), + int(problem_.s) * int(problem_.c), + int(problem_.r) * int(problem_.s) * int(problem_.c) + ); + + conv_workspace_.configuration.layout_output.stride() = make_Coord( + int(problem_.k), + int(problem_.q) * int(problem_.k), + int(problem_.q) * int(problem_.p) * int(problem_.k) + ); + + + // initialize library::ConvArguments + conv_workspace_.arguments.A = nullptr; + conv_workspace_.arguments.B = nullptr; + conv_workspace_.arguments.C = nullptr; + conv_workspace_.arguments.D = nullptr; + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // initialize reduction operation for parallel splitKMode + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if(!initialize_reduction_configuration_(options, report, device_context, operation, problem_space, problem)) { + return Status::kErrorInternal; + } + } + + initialize_result_(this->model_result_, options, operation_desc, problem_space); + + return operation->can_implement(&conv_workspace_.configuration, &conv_workspace_.arguments); +} + +/// Initializes the performance result +void Conv2dOperationProfiler::initialize_result_( + PerformanceResult &result, + Options const &options, + library::ConvDescription const &operation_desc, + ProblemSpace const &problem_space) { + + result.provider = library::Provider::kCUTLASS; + result.disposition = Disposition::kNotRun; + result.status = Status::kSuccess; + result.operation_name = operation_desc.name; + + result.arguments.resize(problem_space.rank()); + + set_argument(result, "Activation", problem_space, + std::string(library::to_string(operation_desc.activation().element)) + + ":" + library::to_string(operation_desc.activation().layout)); + + set_argument(result, "Filter", problem_space, + std::string(library::to_string(operation_desc.filter().element)) + + ":" + library::to_string(operation_desc.filter().layout)); + + set_argument(result, "Output", problem_space, + std::string(library::to_string(operation_desc.output().element)) + + ":" + library::to_string(operation_desc.output().layout)); + + set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind)); + + set_argument(result, "iterator_algorithm", problem_space, std::string(library::to_string(operation_desc.iterator_algorithm))); + + set_argument(result, "n", problem_space, problem_.n); + set_argument(result, "h", problem_space, problem_.h); + set_argument(result, "w", problem_space, problem_.w); + set_argument(result, "c", problem_space, problem_.c); + + set_argument(result, "k", problem_space, problem_.k); + set_argument(result, "r", problem_space, problem_.r); + set_argument(result, "s", problem_space, problem_.s); + + set_argument(result, "p", problem_space, problem_.p); + set_argument(result, "q", problem_space, problem_.q); + + set_argument(result, "pad_h", problem_space, problem_.pad_h); + set_argument(result, "pad_w", problem_space, problem_.pad_w); + + set_argument(result, "stride_h", problem_space, problem_.stride_h); + set_argument(result, "stride_w", problem_space, problem_.stride_w); + + set_argument(result, "dilation_h", problem_space, problem_.dilation_h); + set_argument(result, "dilation_w", problem_space, problem_.dilation_w); + + set_argument(result, "split_k_mode", problem_space, + std::string(library::to_string(problem_.split_k_mode))); 
+  set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
+
+  set_argument(result, "conv_mode", problem_space,
+    std::string(library::to_string(problem_.conv_mode)));
+
+  set_argument(result, "alpha", problem_space,
+    library::lexical_cast(problem_.alpha, operation_desc.element_epilogue));
+
+  set_argument(result, "beta", problem_space,
+    library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
+
+  set_argument(result, "eq_gemm_provider", problem_space,
+    std::string(library::to_string(problem_.eq_gemm_provider)));
+
+  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
+
+  // Bytes of activation, filter, and output tensors
+  int64_t activation_bytes = int64_t(library::sizeof_bits(operation_desc.activation().element) / 8) *
+    conv_workspace_.configuration.problem_size.activation_size();
+
+  int64_t filter_bytes = int64_t(library::sizeof_bits(operation_desc.filter().element) / 8) *
+    conv_workspace_.configuration.problem_size.filter_size();
+
+  int64_t output_bytes = int64_t(library::sizeof_bits(operation_desc.output().element) / 8) *
+    conv_workspace_.configuration.problem_size.output_size();
+
+  // Total bytes read and written, modeled on the equivalent GEMM problem
+  result.bytes = problem_.bytes(operation_desc);
+
+  // Theoretical flops required for the computation
+  result.flops = problem_.flops(operation_desc);
+
+  // Measured runtime
+  result.runtime = 0;
+
+}
+
+/// Initializes reduction problem dimensions and library::Operation
+bool Conv2dOperationProfiler::initialize_reduction_configuration_(
+  Options const &options,
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  library::ConvDescription const &conv_desc =
+    static_cast<library::ConvDescription const &>(operation->description());
+
+  library::ConvKind const &conv_kind = conv_desc.conv_kind;
+
+  if (!cast_from_double(problem_.alpha_one, conv_desc.element_epilogue, 1)) {
+    return false;
+  }
+
+  if (!cast_from_double(problem_.beta_zero, conv_desc.element_epilogue, 0)) {
+    return false;
+  }
+
+  /// This chooses the appropriate stride element of the row-major C tensor.
+  int const & tensor_c_stride_idx = (conv_kind == library::ConvKind::kWgrad ?
2 : 0); + + /// intialize library::ReductionConfiguration + conv_workspace_.reduction_configuration.problem_size = problem_.eq_gemm_size(conv_kind).mn(); + conv_workspace_.reduction_configuration.partitions = int(problem_.split_k_slices); + conv_workspace_.reduction_configuration.partition_stride = problem_.eq_gemm_size(conv_kind).mn().product(); + conv_workspace_.reduction_configuration.ldw = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx]; + conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx]; + conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx]; + + // find reduction operation + library::ReductionFunctionalKey reduction_key( + library::Provider::kCUTLASS, + conv_desc.tile_description.math_instruction.element_accumulator, // element workspace + conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator + conv_desc.C.element, // element output + conv_desc.element_epilogue // element compute + ); + +#if 0// debug print to check which reduction instance is selected + std::cout << reduction_key << "\n"; +#endif + auto reduction_it = Singleton::get().operation_table.reduction_operations.find(reduction_key); + + if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) { + + return false; + } + + // initialize reduction operation required for parallel split-k conv2d operator + reduction_op_ = reduction_it->second; + + // reduction operation found and initialized + return true; +} + + +/// Initializes workspace +Status Conv2dOperationProfiler::initialize_workspace( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + return Status::kErrorNotSupported; + } + } + + library::ConvDescription const &operation_desc = + static_cast(underlying_operation->description()); + + // Compute the number of copies of the problem to avoid L2 camping. 
+ if (!options.profiling.workspace_count) { + int64_t bytes = problem_.bytes(operation_desc); + if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) { + conv_workspace_.problem_count = + 1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes); + } + else { + conv_workspace_.problem_count = 1; + } + } + else { + conv_workspace_.problem_count = options.profiling.workspace_count; + } + + + if (options.execution_mode != ExecutionMode::kDryRun) { + + conv_workspace_.A = device_context.allocate_tensor( + options, + "A", + operation_desc.A.element, + operation_desc.A.layout, + problem_.extent_a(operation_desc.conv_kind), + conv_workspace_.stride_a(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.B = device_context.allocate_tensor( + options, + "B", + operation_desc.B.element, + operation_desc.B.layout, + problem_.extent_b(operation_desc.conv_kind), + conv_workspace_.stride_b(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.C = device_context.allocate_tensor( + options, + "C", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.Computed = device_context.allocate_tensor( + "D", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.Reference = device_context.allocate_tensor( + "Reference", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + } + + // + // Initialize the CUTLASS operation + // + Status status = Status::kSuccess; + + if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + + if (options.execution_mode != ExecutionMode::kDryRun) { + + uint64_t workspace_size = underlying_operation->get_host_workspace_size(&conv_workspace_.configuration); + conv_workspace_.host_workspace.resize(workspace_size, 0); + + workspace_size = underlying_operation->get_device_workspace_size(&conv_workspace_.configuration); + conv_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size); + + status = underlying_operation->initialize( + &conv_workspace_.configuration, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data()); + + if (status != Status::kSuccess) { + return status; + } + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + workspace_size = reduction_op_->get_host_workspace_size(&conv_workspace_.reduction_configuration); + conv_workspace_.reduction_host_workspace.resize(workspace_size, 0); + + status = reduction_op_->initialize( + &conv_workspace_.reduction_configuration, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + + if (status != Status::kSuccess) { + return status; + } + } + } + + // + // If CUTLASS is enabled, generate a result for it + // + results_.push_back(model_result_); + results_.back().provider = library::Provider::kCUTLASS; + results_.back().op_kind = library::OperationKind::kConv2d; + results_.back().disposition = Disposition::kNotRun; + + for(auto provider : verification_providers_) { + results_.back().verification_map[provider] = Disposition::kNotRun; + } + } + + return status; +} + 
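+#if 0
+// Illustrative sketch only: a hypothetical standalone helper mirroring the
+// workspace-count logic in initialize_workspace() above. The profiler rotates
+// through `problem_count` copies of the tensors so consecutive iterations touch
+// different memory and the working set cannot stay resident in L2. For example,
+// assuming a GPU with a 40 MiB L2 and a problem touching 8 MiB of tensor data:
+//   problem_count = 1 + (3 * 40 MiB) / (8 MiB) = 16 rotating copies.
+static int compute_rotating_problem_count(int64_t problem_bytes, int64_t l2_cache_bytes) {
+
+  if (problem_bytes < 3 * l2_cache_bytes) {
+    return 1 + int((3 * l2_cache_bytes) / problem_bytes);
+  }
+
+  return 1;
+}
+#endif
+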
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Verifies CUTLASS against references +bool Conv2dOperationProfiler::verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + if (!options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + return true; + } + + if (options.execution_mode == ExecutionMode::kDryRun) { + return true; + } + + cudaError_t result; + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.C = conv_workspace_.C->data(); + conv_workspace_.arguments.D = conv_workspace_.Computed->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data()); + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + conv_workspace_.arguments.D = conv_workspace_.device_workspace.data(); + conv_workspace_.arguments.alpha = problem_.alpha_one.data(); + conv_workspace_.arguments.beta = problem_.beta_zero.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->data(); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->data(); + conv_workspace_.reduction_arguments.alpha = problem_.alpha.data(); + conv_workspace_.reduction_arguments.beta = problem_.beta.data(); + conv_workspace_.reduction_arguments.pointer_mode = library::ScalarPointerMode::kHost; + } + + // + // Run the CUTLASS operation + // + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + results_.back().disposition = Disposition::kFailed; + return false; + } + } + +#if 0 + std::cout << "profiling : " << std::endl + << "conv2d : " << operation->description().name << std::endl + << "underlying conv2d : " << underlying_operation->description().name << std::endl + << "reduction : " << reduction_op_->description().name << std::endl; +#endif + + // run cutlass conv2d operation + results_.back().status = underlying_operation->run( + &conv_workspace_.arguments, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data()); + + if (results_.back().status != Status::kSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + results_.back().status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + + if (results_.back().status != Status::kSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + } + + // Synchronize before running 
device reference + result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + // CUTLASS op ran the but not yet verified against any verification provider + results_.back().disposition = Disposition::kNotVerified; + + // + // Run verification providers + // + + if (options.verification.enabled) { + +#if CUTLASS_ENABLE_CUDNN + // Run verification cudnn reference + if (options.verification.provider_enabled(library::Provider::kCUDNN)) { + + // Guard against unsupported cases + auto const & conv_desc = static_cast(operation->description()); + + Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration); + + // Initialize reference data to the source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + if (status == Status::kSuccess) { + // call cudnn verification if supported + verify_with_cudnn_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + else if (status == Status::kErrorInvalidProblem) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kInvalidProblem; + } + + else { + // set verification map for cudnn to not supported + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kNotSupported; + } + } +#endif // #if CUTLASS_ENABLE_CUDNN + + // Run verification device reference + if (options.verification.provider_enabled(library::Provider::kReferenceDevice)) { + + // Restore reference data back to initial source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + verify_with_device_reference_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + // Run verification host reference + if (options.verification.provider_enabled(library::Provider::kReferenceHost)) { + + // Restore reference data back to initial source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + verify_with_host_reference_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + // Update disposition to worst case verification outcome among all + // verification providers which are supported + bool is_any_verification_run_passed = false; + for(auto &m : results_.back().verification_map) { + if(m.second == Disposition::kFailed || m.second == Disposition::kIncorrect) { + results_.back().disposition = m.second; + return true; + } + if(!is_any_verification_run_passed && m.second == Disposition::kPassed) { + is_any_verification_run_passed = true; + } + } + + if(is_any_verification_run_passed) { + results_.back().disposition = Disposition::kPassed; + } + } + + // Return true means continue profiling + return true; +} + + +/// Verifies CUTLASS against host reference +bool Conv2dOperationProfiler::verify_with_host_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + Status status; + + // + // Find host reference operation using conv2d functional description key + // + library::OperationDescription const &desc = operation->description(); + + auto &conv_desc = static_cast(desc); + + library::ConvFunctionalKey conv2d_key( + library::Provider::kReferenceHost, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.C.element, + conv_desc.C.layout, + 
conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue); + +#if 0 // debug print to check which host refererence instance is selected + std::cout << conv2d_key << "\n"; +#endif + + auto operators_it = Singleton::get().operation_table.conv2d_operations.find(conv2d_key); + + if(operators_it == Singleton::get().operation_table.conv2d_operations.end()) { + + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun; + return true; + } + + // conv2d host reference minimum cc is 0 (CPU) and no iterator algorithm + library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone); + auto cc_it = operators_it->second.find(preference_key); + + if(cc_it == operators_it->second.end()) { + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun; + return true; + } + + // host refernce has only one instances in Conv2dOperationVectorMap + library::Operation const *reference_op = cc_it->second[0]; + + // + // Copy input tensors A, B, and C from device to host buffers + // + conv_workspace_.host_tensor_a.resize(conv_workspace_.A->bytes()); + conv_workspace_.host_tensor_b.resize(conv_workspace_.B->bytes()); + conv_workspace_.host_tensor_c.resize(conv_workspace_.C->bytes()); + + conv_workspace_.A->copy_to_host(conv_workspace_.host_tensor_a.data()); + conv_workspace_.B->copy_to_host(conv_workspace_.host_tensor_b.data()); + conv_workspace_.C->copy_to_host(conv_workspace_.host_tensor_c.data()); + + // + // Initialize structure containing Conv2d arguments + // + conv_workspace_.arguments.A = conv_workspace_.host_tensor_a.data(); + conv_workspace_.arguments.B = conv_workspace_.host_tensor_b.data(); + conv_workspace_.arguments.C = conv_workspace_.host_tensor_c.data(); + conv_workspace_.arguments.D = conv_workspace_.host_tensor_c.data(); + + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // + // Intialize host reference operation + // + std::vector host_workspace_reference_op; + + uint64_t workspace_size = reference_op->get_host_workspace_size(&conv_workspace_.configuration); + host_workspace_reference_op.resize(workspace_size, 0); + + reference_op->initialize( + &conv_workspace_.configuration, + host_workspace_reference_op.data()); + + // + // Run host reference operation + // + status = reference_op->run( + &conv_workspace_.arguments, + host_workspace_reference_op.data()); + + // Handle errors + if (status != Status::kSuccess) { + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotVerified; + return true; + } + + // + // Copy host reference output to device memory for equality check on device + // + conv_workspace_.Reference->copy_from_host(conv_workspace_.arguments.D); + + // + // Verify results + // + results_.back().verification_map[library::Provider::kReferenceHost] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference, + conv_workspace_.Computed->batch_stride() + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + static_cast(operation->description()), + library::Provider::kCUTLASS, + library::Provider::kReferenceHost); + } + + // Return true means continue 
profiling + return true; +} + + +/// Verifies CUTLASS against host reference +bool Conv2dOperationProfiler::verify_with_device_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + Status status; + + // + // Find device reference operation using conv2d functional description key + // + library::OperationDescription const &desc = operation->description(); + + auto &conv_desc = static_cast(desc); + + library::ConvFunctionalKey conv2d_key( + library::Provider::kReferenceDevice, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.C.element, + conv_desc.C.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue); + + auto operators_it = Singleton::get().operation_table.conv2d_operations.find(conv2d_key); + + if(operators_it == Singleton::get().operation_table.conv2d_operations.end()) { + + results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun; + + return true; + } + + // conv2d device reference minimum cc is 50 and no iterator algorithm + library::ConvPreferenceKey preference_key(50, library::IteratorAlgorithmID::kNone); + auto cc_it = operators_it->second.find(preference_key); + + if(cc_it == operators_it->second.end()) { + results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun; + + return true; + } + + // device refernce has only one instances in Conv2dOperationVectorMap + library::Operation const *reference_op = cc_it->second[0]; + + // + // Intialize device reference operation + // + std::vector host_workspace_reference_op; + + uint64_t workspace_size = reference_op->get_host_workspace_size(&conv_workspace_.configuration); + host_workspace_reference_op.resize(workspace_size, 0); + + reference_op->initialize( + &conv_workspace_.configuration, + host_workspace_reference_op.data()); + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.C = conv_workspace_.C->data(); + conv_workspace_.arguments.D = conv_workspace_.Reference->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // + // Run device reference operation + // + status = reference_op->run( + &conv_workspace_.arguments, + host_workspace_reference_op.data()); + + + // Handle errors + if (status != Status::kSuccess) { + results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotVerified; + return true; + } + + // + // Verify results + // + results_.back().verification_map[library::Provider::kReferenceDevice] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference, + conv_workspace_.Computed->batch_stride() + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kReferenceDevice] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + static_cast(operation->description()), + library::Provider::kCUTLASS, + library::Provider::kReferenceDevice); + } + + // Return true means continue profiling 
+ return true; +} + +/// Measures performance results +bool Conv2dOperationProfiler::profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + + if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.C = conv_workspace_.C->data(); + conv_workspace_.arguments.D = conv_workspace_.Computed->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + conv_workspace_.arguments.D = conv_workspace_.device_workspace.data(); + conv_workspace_.arguments.alpha = problem_.alpha_one.data(); + conv_workspace_.arguments.beta = problem_.beta_zero.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->data(); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->data(); + conv_workspace_.reduction_arguments.alpha = problem_.alpha.data(); + conv_workspace_.reduction_arguments.beta = problem_.beta.data(); + conv_workspace_.reduction_arguments.pointer_mode = library::ScalarPointerMode::kHost; + } + + results_.back().status = profile_cutlass_( + results_.back().runtime, + options, + operation, + &conv_workspace_.arguments, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data() + ); + } + return true; + +} + +/// Method to profile a CUTLASS Operation +Status Conv2dOperationProfiler::profile_cutlass_( + double &runtime, + Options const &options, + library::Operation const *operation, + void *arguments, + void *host_workspace, + void *device_workspace) { + + GpuTimer timer; + + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + library::ConvArguments *conv_arguments = static_cast(arguments); + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + return Status::kErrorNotSupported; + } + } + + // + // Optional sleep to limit power consumption and thermals + // + + sleep(options.profiling.sleep_duration); + + // + // Warmup loop + // + + Status status; + + for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) { + + // Setup rotating workspace + int workspace_idx = options.profiling.warmup_iterations + iteration; + int problem_idx = (workspace_idx % conv_workspace_.problem_count); + + conv_arguments->A = conv_workspace_.A->batch_data(problem_idx); + conv_arguments->B = conv_workspace_.B->batch_data(problem_idx); + conv_arguments->C = conv_workspace_.C->batch_data(problem_idx); + conv_arguments->D = conv_workspace_.Computed->batch_data(problem_idx); + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + 
conv_arguments->D = conv_workspace_.device_workspace.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); + } + + // Run underlying conv2d operation + status = underlying_operation->run( + arguments, + host_workspace, + device_workspace); + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + } + + if (status != Status::kSuccess) { + return status; + } + } + + // + // Initialize GPU timer + // + + timer.start(); + + // + // Profiling loop + // + + int Iterations = options.profiling.iterations; + + int iteration = 0; + for (; iteration < Iterations; ++iteration) { + + // Setup rotating workspace + int problem_idx = (iteration % conv_workspace_.problem_count); + + conv_arguments->A = conv_workspace_.A->batch_data(problem_idx); + conv_arguments->B = conv_workspace_.B->batch_data(problem_idx); + conv_arguments->C = conv_workspace_.C->batch_data(problem_idx); + conv_arguments->D = conv_workspace_.Computed->batch_data(problem_idx); + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + conv_arguments->D = conv_workspace_.device_workspace.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); + } + + // Run underlying conv2d operation + status = underlying_operation->run( + arguments, + host_workspace, + device_workspace); + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + } + + if (status != Status::kSuccess) { + return status; + } + } + + // + // Wait for completion + // + + timer.stop_and_wait(); + + // + // Update performance result + // + + runtime = timer.duration(iteration); + + return status; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +#if CUTLASS_ENABLE_CUDNN + +/// Verifies CUTLASS against cudnn reference +bool Conv2dOperationProfiler::verify_with_cudnn_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + auto &conv_desc = static_cast(operation->description()); + + // + // Construct cudnn operators + // + + CudnnCreate handle; + cudnnStatus_t status = handle.get_cudnn_create_status(); + + if (status != CUDNN_STATUS_SUCCESS) { + + results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status); + return true; + } + + // + // Initialize state + // + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = 
conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.D = conv_workspace_.Reference->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // cuDNN does not support four tensor arguments, so we copy the tensor C data into + // tensor D. + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + conv_workspace_.arguments.C = conv_workspace_.arguments.D; + + try { + + // + // Construct dispatcher to cudnn operator + // + + detail::cudnnConvDispatcher conv_op( + conv_desc, + conv_workspace_.configuration, + conv_workspace_.arguments, + handle + ); + + if (conv_op.status != Status::kSuccess) { + if (conv_op.status == Status::kErrorNotSupported) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kNotSupported; + + } else { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kFailed; + } + return true; + } + + + status = conv_op(handle); + + // Handle errors + if (status != CUDNN_STATUS_SUCCESS) { + + results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status); + return true; + } + + // + // Verify results + // + + results_.back().verification_map[library::Provider::kCUDNN] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference, + conv_workspace_.Computed->batch_stride() + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + conv_desc, + library::Provider::kCUTLASS, + library::Provider::kCUDNN); + } + } + catch (...) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kFailed; + } + + // Return true means continue profiling + return true; +} + +#endif // #if CUTLASS_ENABLE_CUDNN + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/conv2d_operation_profiler.h b/tools/profiler/src/conv2d_operation_profiler.h new file mode 100644 index 0000000000..40c003e1d4 --- /dev/null +++ b/tools/profiler/src/conv2d_operation_profiler.h @@ -0,0 +1,431 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines profiling functionality for convolution + +*/ + +#pragma once + +#include +#include +#include +#include +#include + +// CUTLASS Library includes +#include "cutlass/library/library.h" +#include "cutlass/library/util.h" +#include "cutlass/library/handle.h" +#include "cutlass/library/manifest.h" +#include "cutlass/library/singleton.h" + +// Profiler includes +#include "options.h" +#include "device_context.h" +#include "operation_profiler.h" +#include "performance_result.h" +#include "problem_space.h" +#include "reduction_operation_profiler.h" +#if CUTLASS_ENABLE_CUDNN +#include "cudnn_helpers.h" +#endif //#if CUTLASS_ENABLE_CUDNN +#include "debug.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Abstract base class for each math function +class Conv2dOperationProfiler : public OperationProfiler { +public: + + /// Problem structure obtained from problem space + struct Conv2dProblem { + + int64_t n, h, w, c, p, q, k, r, s; + int64_t pad_h, pad_w; + int64_t stride_h, stride_w; + int64_t dilation_h, dilation_w; + + std::vector alpha; + std::vector beta; + + library::SplitKMode split_k_mode; + int64_t split_k_slices; + + library::ConvModeID conv_mode; + + library::Provider eq_gemm_provider; + + // convolution with parallel interleaved reduction + // convolution epilogue (alpha, beta) = (1.0, 0.0) + // reduction epilogue (alpha, beta) = (Conv2dProblem::alpha, Conv2dProblem::beta) + std::vector alpha_one; + std::vector beta_zero; + + // + // Methods + // + + /// Total number of bytes loaded + int64_t bytes(library::ConvDescription const &operation_desc) const; + + /// Total number of flops computed + int64_t flops(library::ConvDescription const &operation_desc) const; + + void set_default_output_size() { + p = ((h + pad_h - r * dilation_h) / stride_h) + 1; + q = ((w + pad_w - s * dilation_w) / stride_w) + 1; + } + + // Returns equivalent gemm problem size for convolution + cutlass::gemm::GemmCoord eq_gemm_size(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return cutlass::gemm::GemmCoord(int(n * p * q), int(k), int(r * s * c)); + case library::ConvKind::kDgrad: return cutlass::gemm::GemmCoord(int(n * h * w), int(c), int(k * r * s)); + case library::ConvKind::kWgrad: return cutlass::gemm::GemmCoord(int(k), int(r * s * c), int(n * p * q)); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent 
for tensor A + std::vector extent_a(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(n), int(h), int(w), int(c)}; + case library::ConvKind::kDgrad: return {int(n), int(p), int(q), int(k)}; + case library::ConvKind::kWgrad: return {int(n), int(p), int(q), int(k)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor B + std::vector extent_b(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(k), int(r), int(s), int(c)}; + case library::ConvKind::kDgrad: return {int(k), int(r), int(s), int(c)}; + case library::ConvKind::kWgrad: return {int(n), int(h), int(w), int(c)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor C + std::vector extent_c(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(n), int(p), int(q), int(k)}; + case library::ConvKind::kDgrad: return {int(n), int(h), int(w), int(c)}; + case library::ConvKind::kWgrad: return {int(k), int(r), int(s), int(c)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix A + library::LayoutTypeID eq_gemm_layout_a(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return library::LayoutTypeID::kRowMajor; // TN Gemm + case library::ConvKind::kDgrad: return library::LayoutTypeID::kRowMajor; // TT Gemm + case library::ConvKind::kWgrad: return library::LayoutTypeID::kColumnMajor; // NT Gemm + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix B + library::LayoutTypeID eq_gemm_layout_b(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return library::LayoutTypeID::kColumnMajor; // TN Gemm + case library::ConvKind::kDgrad: return library::LayoutTypeID::kRowMajor; // TT Gemm + case library::ConvKind::kWgrad: return library::LayoutTypeID::kRowMajor; // NT Gemm + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix C + library::LayoutTypeID eq_gemm_layout_c(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + // Gemm operator assumes column-major output + case library::ConvKind::kFprop: + case library::ConvKind::kDgrad: + case library::ConvKind::kWgrad: return library::LayoutTypeID::kColumnMajor; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix A + int64_t eq_gemm_lda(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kDgrad: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).m(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix B + int64_t eq_gemm_ldb(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kDgrad: return eq_gemm_size(conv_kind).n(); + case 
library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).n(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix C + int64_t eq_gemm_ldc(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: + case library::ConvKind::kDgrad: + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).m(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + }; + + /// Workspace used + struct Conv2dWorkspace { + + /// Conv device allocations + DeviceAllocation *A; + DeviceAllocation *B; + DeviceAllocation *C; + DeviceAllocation *Computed; + DeviceAllocation *Reference; + + /// Library configuration and arguments for convolution operator + library::Conv2dConfiguration configuration; + library::ConvArguments arguments; + + /// Number of copies of the problem workspace which are visited sequentially during + /// profiling to avoid camping in the last level cache. + int problem_count; + + /// Buffer used for the cutlass conv2d operations' host workspace + std::vector host_workspace; + + /// Buffer used for the cutlass operations' device workspace + DeviceAllocation device_workspace; + + /// Library configuration and arguments for reduction operator + library::ReductionConfiguration reduction_configuration; + library::ReductionArguments reduction_arguments; + + /// Buffer used for the cutlass reduction operations' host workspace + std::vector reduction_host_workspace; + + /// Host data buffers for host reference operation + /// host buffer for tensor + std::vector host_tensor_a; + + /// host buffer for tensor b + std::vector host_tensor_b; + + /// host buffer for tensor c + std::vector host_tensor_c; + + + // + // Methods + // + + Conv2dWorkspace(): + A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { } + + // Returns stride vector for tensor A + std::vector stride_a(library::ConvKind const &conv_kind) { + return { + configuration.layout_a(conv_kind).stride()[0], + configuration.layout_a(conv_kind).stride()[1], + configuration.layout_a(conv_kind).stride()[2] + }; + } + + // Returns stride vector for tensor B + std::vector stride_b(library::ConvKind const &conv_kind) { + + return { + configuration.layout_b(conv_kind).stride()[0], + configuration.layout_b(conv_kind).stride()[1], + configuration.layout_b(conv_kind).stride()[2] + }; + } + + // Returns stride vector for tensor C + std::vector stride_c(library::ConvKind const &conv_kind) { + + return { + configuration.layout_c(conv_kind).stride()[0], + configuration.layout_c(conv_kind).stride()[1], + configuration.layout_c(conv_kind).stride()[2] + }; + } + }; + +protected: + + // + // Data members + // + + /// CONV problem obtained from problem space + Conv2dProblem problem_; + + /// Device memory allocations + Conv2dWorkspace conv_workspace_; + + /// CUTLASS parallel reduction operation to follow this* conv2d operation + library::Operation const *reduction_op_; + +public: + // + // Methods + // + + /// Ctor + Conv2dOperationProfiler(Options const &options); + + /// Destructor + virtual ~Conv2dOperationProfiler(); + + /// Prints usage statement for the math function + virtual void print_usage(std::ostream &out) const; + + /// Prints examples + virtual void print_examples(std::ostream &out) const; + + /// Extracts the problem dimensions + virtual Status initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext 
&device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes workspace + virtual Status initialize_workspace( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against references + virtual bool verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Measures performance results + virtual bool profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +protected: + /// Method to profile an initialized CUTLASS operation + virtual Status profile_cutlass_( + double &runtime, + Options const &options, + library::Operation const *operation, + void *arguments, + void *host_workspace, + void *device_workspace); + + + /// Initialize reduction problem dimenstions and library::Operation + bool initialize_reduction_configuration_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes the performance result + void initialize_result_( + PerformanceResult &result, + Options const &options, + library::ConvDescription const &operation_desc, + ProblemSpace const &problem_space); + + /// Verifies CUTLASS against host reference + bool verify_with_host_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against device reference + bool verify_with_device_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +#if CUTLASS_ENABLE_CUDNN + + /// Verifies CUTLASS against cudnn reference + bool verify_with_cudnn_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +#endif //#if CUTLASS_ENABLE_CUDNN + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/conv3d_operation_profiler.cu b/tools/profiler/src/conv3d_operation_profiler.cu new file mode 100644 index 0000000000..67f21d8f7a --- /dev/null +++ b/tools/profiler/src/conv3d_operation_profiler.cu @@ -0,0 +1,1345 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Convolution 3D profiling + +*/ + +#include +#include +#include +#include + +#include "cutlass/core_io.h" + +#include "conv3d_operation_profiler.h" +#include "gpu_timer.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// +using namespace cutlass::library; + +namespace cutlass { +namespace profiler { + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Ctor +Conv3dOperationProfiler::Conv3dOperationProfiler(Options const &options): + OperationProfiler( + options, + library::OperationKind::kConv3d, + { + {ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"}, + {ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"d", "input_d"}, "Input D dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"h", "input_h"}, "Input H dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"w", "input_w"}, "Input W dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"c", "input_c"}, "Input C dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"k", "filter_k"}, "Filter K dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"t", "filter_t"}, "Filter T dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"r", "filter_r"}, "Filter R dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"s", "filter_s"}, "Filter S dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"z", "output_z"}, "Output Z dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"p", "output_p"}, "Output P dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"q", "output_q"}, "Output Q dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"pad_d"}, "Padding in D 
direction"}, + {ArgumentTypeID::kInteger, {"pad_h"}, "Padding in H direction"}, + {ArgumentTypeID::kInteger, {"pad_w"}, "Padding in W direction"}, + {ArgumentTypeID::kInteger, {"stride_d"}, "Stride in D direction"}, + {ArgumentTypeID::kInteger, {"stride_h"}, "Stride in H direction"}, + {ArgumentTypeID::kInteger, {"stride_w"}, "Stride in W direction"}, + {ArgumentTypeID::kInteger, {"dilation_d"}, "Dilation in D direction"}, + {ArgumentTypeID::kInteger, {"dilation_h"}, "Dilation in H direction"}, + {ArgumentTypeID::kInteger, {"dilation_w"}, "Dilation in W direction"}, + {ArgumentTypeID::kTensor, {"Activation"}, "Tensor storing the Activation operand"}, + {ArgumentTypeID::kTensor, {"Filter"}, "Tensor storing the Filter operand"}, + {ArgumentTypeID::kTensor, {"Output"}, "Tensor storing the Output operand"}, + {ArgumentTypeID::kEnumerated, {"conv_mode"}, "Convolution filter mode (conv, cross)"}, + {ArgumentTypeID::kEnumerated, {"iterator_algorithm", "iterator_algo"}, "Convolution iterator algorithm (analytic, optimized)"}, + {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, + {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, + {ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "SplitK mode for serial or parallel reduction (serial, parallel)"}, + {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"}, + {ArgumentTypeID::kEnumerated, {"eq_gemm_provider", "eq-gemm-provider"}, "Enable profiling equivalent gemm by the following providers (cutlass)"}, + }, + { library::Provider::kReferenceDevice, library::Provider::kReferenceHost, library::Provider::kCUDNN } + ) { + + description_ = " Conv3d operation. Output(Tensor5D) = alpha * Input(Tensor5D) * Filter(Tensor5D) + beta * Input(Tensor5D)"; + +} + +/// Destructor +Conv3dOperationProfiler::~Conv3dOperationProfiler() { + +} + + +/// Prints usage statement for the math function +void Conv3dOperationProfiler::print_usage(std::ostream &out) const { + out << "Conv3d" << "\n\n"; + + OperationProfiler::print_usage(out); +} + +/// Prints examples +void Conv3dOperationProfiler::print_examples(std::ostream &out) const { + + out << "\nExamples:\n\n" + << "Profile a particular convolution (specify all the convolution parameters):\n" + << " $ cutlass_profiler --operation=Conv3d" + " --Activation=f16:ndhwc --Filter=f16:ndhwc --Output=f16 --accumulator-type=f32" + " --n=32 --d=16 --h=14 --w=14 --c=8 --k=64 --t=3 --r=3 --s=3" + " --pad_d=1 --pad_h=1 --pad_w=1" + " --stride_d=1 --stride::h=1 --stride::w=1" + " --dilation_d=1 --dilation::h=1 --dilation::w=1\n\n"; +} + +#if 0 +// used this for debugging +static std::string byte_string(std::vector const &bytes) { + std::stringstream ss; + + ss << "0x"; + + for (size_t idx = bytes.size(); idx > 0; --idx) { + ss << std::hex << std::setw(2) << std::setfill('0') << uint32_t(bytes.at(idx - 1)); + } + + return ss.str(); +} +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Total number of bytes loaded +int64_t Conv3dOperationProfiler::Conv3dProblem::bytes(library::ConvDescription const &operation_desc) const { + cutlass::gemm::GemmCoord mnk = eq_gemm_size(operation_desc.conv_kind); + + // Input bytes read and Output bytes written for the gemm problem + int64_t bytes_ = + int64_t(library::sizeof_bits(operation_desc.A.element) * mnk.m() / 8) * mnk.k() + + int64_t(library::sizeof_bits(operation_desc.B.element) * mnk.n() / 8) * mnk.k() + + 
int64_t(library::sizeof_bits(operation_desc.C.element) * mnk.m() / 8) * mnk.n(); + + // Set is_beta_zero true if beta is zero + bool is_beta_zero = std::all_of(beta.begin(), beta.end(), [](uint8_t i) { return i==0; }); + + // Output bytes read for the gemm problem for non-zero beta values + if (!is_beta_zero) { + bytes_ += int64_t(library::sizeof_bits(operation_desc.C.element) * mnk.m() / 8) * mnk.n(); + } + + return bytes_; +} + +/// Total number of flops computed +int64_t Conv3dOperationProfiler::Conv3dProblem::flops( + library::ConvDescription const &operation_desc) const { + + cutlass::gemm::GemmCoord mnk = eq_gemm_size(operation_desc.conv_kind); + + int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2; + int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2; + + // Adjust mainloop flop for dgrad strided + if (operation_desc.conv_kind == library::ConvKind::kDgrad) { + flops_mainloop_ = flops_mainloop_ / ( stride_d * stride_h * stride_w); + } + + return (flops_mainloop_ + flops_epilogue_); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Extracts the problem dimensions +Status Conv3dOperationProfiler::initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + library::ConvDescription const &operation_desc = + static_cast(operation->description()); + + if (!arg_as_int(problem_.n, "n", problem_space, problem)) { + // default value + problem_.n = 1; + } + + if (!arg_as_int(problem_.d, "d", problem_space, problem)) { + // default value + problem_.d = 8; + } + + if (!arg_as_int(problem_.h, "h", problem_space, problem)) { + // default value + problem_.h = 14; + } + + if (!arg_as_int(problem_.w, "w", problem_space, problem)) { + // default value + problem_.w = 14; + } + + if (!arg_as_int(problem_.c, "c", problem_space, problem)) { + // default value + problem_.c = 32; + } + + if (!arg_as_int(problem_.k, "k", problem_space, problem)) { + // default value + problem_.k = 32; + } + + if (!arg_as_int(problem_.t, "t", problem_space, problem)) { + // default value + problem_.t = 3; + } + + if (!arg_as_int(problem_.r, "r", problem_space, problem)) { + // default value + problem_.r = 3; + } + + if (!arg_as_int(problem_.s, "s", problem_space, problem)) { + // default value + problem_.s = 3; + } + + if (!arg_as_int(problem_.pad_d, "pad_d", problem_space, problem)) { + // default value + problem_.pad_d = 1; + } + + if (!arg_as_int(problem_.pad_w, "pad_w", problem_space, problem)) { + // default value + problem_.pad_w = 1; + } + if (!arg_as_int(problem_.pad_h, "pad_h", problem_space, problem)) { + // default value + problem_.pad_h = 1; + } + + if (!arg_as_int(problem_.stride_d, "stride_d", problem_space, problem)) { + // default value + problem_.stride_d = 1; + } + + if (!arg_as_int(problem_.stride_h, "stride_h", problem_space, problem)) { + // default value + problem_.stride_h = 1; + } + + if (!arg_as_int(problem_.stride_w, "stride_w", problem_space, problem)) { + // default value + problem_.stride_w = 1; + } + + if (!arg_as_int(problem_.dilation_d, "dilation_d", problem_space, problem)) { + // default value + problem_.dilation_d = 1; + } + + if (!arg_as_int(problem_.dilation_h, "dilation_h", problem_space, problem)) { + // default value + problem_.dilation_h = 1; + } + + if (!arg_as_int(problem_.dilation_w, "dilation_w", problem_space, 
problem)) { + // default value + problem_.dilation_w = 1; + } + + //////////////////////// Convolution output dimensions p and q //////////////////////// + // Cutlass convolutions support arbitrary output sizes and not constriant by // + // input, filter, padding, striding, dilation sizes. // + // cuDNN sets the output dimensions (p, q) using following equations: // + // // + // output = div_up(input + 2 * pad - ((filter - 1) * dilation + 1) + 1, stride) // + // where; div_up(a, b) : (a - 1)/b + 1 // + // // + // Thus, when output p and q dimensions are unspecified by the user // + // cutlass profiler sets p and q which are cuDNN compliant. // + // // + //////////////////////////////////////////////////////////////////////////////////////// + // set convolution output z + if (!arg_as_int(problem_.z, "z", problem_space, problem)) { + // default value (set using cudnn formula for output height, when p is not provided) + problem_.z = ( + problem_.d + + 2 * problem_.pad_d - + ((problem_.t - 1) * problem_.dilation_d + 1) + ) / (problem_.stride_d) + + 1; + } + + // set convolution output p + if (!arg_as_int(problem_.p, "p", problem_space, problem)) { + // default value (set using cudnn formula for output height, when p is not provided) + problem_.p = ( + problem_.h + + 2 * problem_.pad_h - + ((problem_.r - 1) * problem_.dilation_h + 1) + ) / (problem_.stride_h) + + 1; + } + + // set convolution output q + if (!arg_as_int(problem_.q, "q", problem_space, problem)) { + // default value (set using cudnn formula for output width, when q is not provided) + problem_.q = ( + problem_.w + + 2 * problem_.pad_w - + ((problem_.s - 1) * problem_.dilation_w + 1) + ) / (problem_.stride_w) + + 1; + } + ///////////////////////////////////////////////////////////////////////////////////////// + + + if (!arg_as_SplitKModeID(problem_.split_k_mode, "split_k_mode", problem_space, problem)) { + // default value + problem_.split_k_mode = library::SplitKMode::kSerial; + } + + if (!arg_as_int(problem_.split_k_slices, "split_k_slices", problem_space, problem)) { + // default value + problem_.split_k_slices = 1; + } + + if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) { + // default value + problem_.conv_mode = library::ConvModeID::kCrossCorrelation; + } + + if (!arg_as_ProviderID(problem_.eq_gemm_provider, "eq_gemm_provider", problem_space, problem)) { + // default value + problem_.eq_gemm_provider = library::Provider::kNone; + } + + if (!conv_kind_satisfies(operation_desc.conv_kind, "conv_kind", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!iterator_algorithm_satisfies(operation_desc.iterator_algorithm, "iterator_algorithm", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.activation(), "Activation", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.filter(), "Filter", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.output(), "Output", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!arg_as_scalar( + problem_.alpha, + operation_desc.element_epilogue, + "alpha", + problem_space, + problem)) { + + if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) { + return Status::kErrorInternal; + } + } + + if (!arg_as_scalar( + problem_.beta, + operation_desc.element_epilogue, + "beta", + 
problem_space, + problem)) { + + if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) { + return Status::kErrorInternal; + } + } + + // initialize library::ConvConfiguration + conv_workspace_.configuration.problem_size = conv::Conv3dProblemSize( + int(problem_.n), + int(problem_.d), + int(problem_.h), + int(problem_.w), + int(problem_.c), + int(problem_.k), + int(problem_.t), + int(problem_.r), + int(problem_.s), + int(problem_.z), + int(problem_.p), + int(problem_.q), + int(problem_.pad_d), + int(problem_.pad_h), + int(problem_.pad_w), + int(problem_.stride_d), + int(problem_.stride_h), + int(problem_.stride_w), + int(problem_.dilation_d), + int(problem_.dilation_h), + int(problem_.dilation_w), + static_cast(static_cast(problem_.conv_mode)), + int(problem_.split_k_slices), + 1 // groups + ); + + conv_workspace_.configuration.split_k_mode = static_cast(static_cast(problem_.split_k_mode)); + + conv_workspace_.configuration.layout_activations.stride() = make_Coord( + int(problem_.c), + int(problem_.w) * int(problem_.c), + int(problem_.h) * int(problem_.w) * int(problem_.c), + int(problem_.d) * int(problem_.h) * int(problem_.w) * int(problem_.c) + ); + + conv_workspace_.configuration.layout_filters.stride() = make_Coord( + int(problem_.c), + int(problem_.s) * int(problem_.c), + int(problem_.r) * int(problem_.s) * int(problem_.c), + int(problem_.t) * int(problem_.r) * int(problem_.s) * int(problem_.c) + ); + + conv_workspace_.configuration.layout_output.stride() = make_Coord( + int(problem_.k), + int(problem_.q) * int(problem_.k), + int(problem_.q) * int(problem_.p) * int(problem_.k), + int(problem_.z) * int(problem_.q) * int(problem_.p) * int(problem_.k) + ); + + + // initialize library::ConvArguments + conv_workspace_.arguments.A = nullptr; + conv_workspace_.arguments.B = nullptr; + conv_workspace_.arguments.C = nullptr; + conv_workspace_.arguments.D = nullptr; + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // initialize reduction operation for parallel splitKMode not supported for conv3d + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if(!initialize_reduction_configuration_(options, report, device_context, operation, problem_space, problem)) { + return Status::kErrorInternal; + } + } + + initialize_result_(this->model_result_, options, operation_desc, problem_space); + + return operation->can_implement(&conv_workspace_.configuration, &conv_workspace_.arguments); +} + +/// Initializes the performance result +void Conv3dOperationProfiler::initialize_result_( + PerformanceResult &result, + Options const &options, + library::ConvDescription const &operation_desc, + ProblemSpace const &problem_space) { + + result.provider = library::Provider::kCUTLASS; + result.disposition = Disposition::kNotRun; + result.status = Status::kSuccess; + result.operation_name = operation_desc.name; + + result.arguments.resize(problem_space.rank()); + + set_argument(result, "Activation", problem_space, + std::string(library::to_string(operation_desc.activation().element)) + + ":" + library::to_string(operation_desc.activation().layout)); + + set_argument(result, "Filter", problem_space, + std::string(library::to_string(operation_desc.filter().element)) + + ":" + library::to_string(operation_desc.filter().layout)); + + set_argument(result, "Output", problem_space, + 
std::string(library::to_string(operation_desc.output().element)) +
+    ":" + library::to_string(operation_desc.output().layout));
+
+  set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind));
+
+  set_argument(result, "iterator_algorithm", problem_space, std::string(library::to_string(operation_desc.iterator_algorithm)));
+
+  set_argument(result, "n", problem_space, problem_.n);
+  set_argument(result, "d", problem_space, problem_.d);
+  set_argument(result, "h", problem_space, problem_.h);
+  set_argument(result, "w", problem_space, problem_.w);
+  set_argument(result, "c", problem_space, problem_.c);
+
+  set_argument(result, "k", problem_space, problem_.k);
+  set_argument(result, "t", problem_space, problem_.t);
+  set_argument(result, "r", problem_space, problem_.r);
+  set_argument(result, "s", problem_space, problem_.s);
+
+  set_argument(result, "z", problem_space, problem_.z);
+  set_argument(result, "p", problem_space, problem_.p);
+  set_argument(result, "q", problem_space, problem_.q);
+
+  set_argument(result, "pad_d", problem_space, problem_.pad_d);
+  set_argument(result, "pad_h", problem_space, problem_.pad_h);
+  set_argument(result, "pad_w", problem_space, problem_.pad_w);
+
+  set_argument(result, "stride_d", problem_space, problem_.stride_d);
+  set_argument(result, "stride_h", problem_space, problem_.stride_h);
+  set_argument(result, "stride_w", problem_space, problem_.stride_w);
+
+  set_argument(result, "dilation_d", problem_space, problem_.dilation_d);
+  set_argument(result, "dilation_h", problem_space, problem_.dilation_h);
+  set_argument(result, "dilation_w", problem_space, problem_.dilation_w);
+
+  set_argument(result, "split_k_mode", problem_space,
+    std::string(library::to_string(problem_.split_k_mode)));
+  set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
+
+  set_argument(result, "conv_mode", problem_space,
+    std::string(library::to_string(problem_.conv_mode)));
+
+  set_argument(result, "alpha", problem_space,
+    library::lexical_cast(problem_.alpha, operation_desc.element_epilogue));
+
+  set_argument(result, "beta", problem_space,
+    library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
+
+  set_argument(result, "eq_gemm_provider", problem_space,
+    std::string(library::to_string(problem_.eq_gemm_provider)));
+
+  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
+
+  // Bytes of activation, filter, and output tensors
+  result.bytes = problem_.bytes(operation_desc);
+
+  // Theoretical flops required for the computation
+  result.flops = problem_.flops(operation_desc);
+
+  // Measured runtime
+  result.runtime = 0;
+
+}
+
+/// Initialize reduction problem dimensions and library::Operation
+bool Conv3dOperationProfiler::initialize_reduction_configuration_(
+  Options const &options,
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  library::ConvDescription const &conv_desc =
+    static_cast<library::ConvDescription const &>(operation->description());
+
+  library::ConvKind const &conv_kind = conv_desc.conv_kind;
+
+  if (!cast_from_double(problem_.alpha_one, conv_desc.element_epilogue, 1)) {
+    return false;
+  }
+
+  if (!cast_from_double(problem_.beta_zero, conv_desc.element_epilogue, 0)) {
+    return false;
+  }
+
+  /// This chooses the appropriate stride element of the row-major C tensor.
+  int const & tensor_c_stride_idx = (conv_kind == library::ConvKind::kWgrad ? 3 : 0);
+
+  /// initialize library::ReductionConfiguration
+  conv_workspace_.reduction_configuration.problem_size = problem_.eq_gemm_size(conv_kind).mn();
+  conv_workspace_.reduction_configuration.partitions = int(problem_.split_k_slices);
+  conv_workspace_.reduction_configuration.partition_stride = problem_.eq_gemm_size(conv_kind).mn().product();
+  conv_workspace_.reduction_configuration.ldw = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
+
+  // find reduction operation
+  library::ReductionFunctionalKey reduction_key(
+    library::Provider::kCUTLASS,
+    conv_desc.tile_description.math_instruction.element_accumulator, // element workspace
+    conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator
+    conv_desc.C.element, // element output
+    conv_desc.element_epilogue // element compute
+  );
+
+#if 0 // debug print to check which reduction instance is selected
+  std::cout << reduction_key << "\n";
+#endif
+  auto reduction_it = Singleton::get().operation_table.reduction_operations.find(reduction_key);
+
+  if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) {
+
+    return false;
+  }
+
+  // initialize reduction operation required for parallel split-k conv3d operator
+  reduction_op_ = reduction_it->second;
+
+  // reduction operation found and initialized
+  return true;
+}
+
+
+/// Initializes workspace
+Status Conv3dOperationProfiler::initialize_workspace(
+  Options const &options,
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  // initialize conv3d underlying operation to handle parallel reduction
+  library::Operation const* underlying_operation = operation;
+
+  if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
+    if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) {
+      return Status::kErrorNotSupported;
+    }
+  }
+
+  library::ConvDescription const &operation_desc =
+    static_cast<library::ConvDescription const &>(underlying_operation->description());
+
+  // Compute the number of copies of the problem to avoid L2 camping.
+ if (!options.profiling.workspace_count) { + int64_t bytes = problem_.bytes(operation_desc); + if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) { + conv_workspace_.problem_count = + 1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes); + } + else { + conv_workspace_.problem_count = 1; + } + } + else { + conv_workspace_.problem_count = options.profiling.workspace_count; + } + + + if (options.execution_mode != ExecutionMode::kDryRun) { + + conv_workspace_.A = device_context.allocate_tensor( + options, + "A", + operation_desc.A.element, + operation_desc.A.layout, + problem_.extent_a(operation_desc.conv_kind), + conv_workspace_.stride_a(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.B = device_context.allocate_tensor( + options, + "B", + operation_desc.B.element, + operation_desc.B.layout, + problem_.extent_b(operation_desc.conv_kind), + conv_workspace_.stride_b(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.C = device_context.allocate_tensor( + options, + "C", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.Computed = device_context.allocate_tensor( + "D", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.Reference = device_context.allocate_tensor( + "Reference", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + } + + // + // Initialize the CUTLASS operation + // + Status status = Status::kSuccess; + + if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + + if (options.execution_mode != ExecutionMode::kDryRun) { + + uint64_t workspace_size = underlying_operation->get_host_workspace_size(&conv_workspace_.configuration); + conv_workspace_.host_workspace.resize(workspace_size, 0); + + workspace_size = underlying_operation->get_device_workspace_size(&conv_workspace_.configuration); + conv_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size); + + status = underlying_operation->initialize( + &conv_workspace_.configuration, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data()); + + if (status != Status::kSuccess) { + return status; + } + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + workspace_size = reduction_op_->get_host_workspace_size(&conv_workspace_.reduction_configuration); + conv_workspace_.reduction_host_workspace.resize(workspace_size, 0); + + status = reduction_op_->initialize( + &conv_workspace_.reduction_configuration, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + + if (status != Status::kSuccess) { + return status; + } + } + } + + // + // If CUTLASS is enabled, generate a result for it + // + results_.push_back(model_result_); + results_.back().provider = library::Provider::kCUTLASS; + results_.back().op_kind = library::OperationKind::kConv3d; + results_.back().disposition = Disposition::kNotRun; + + for(auto provider : verification_providers_) { + results_.back().verification_map[provider] = Disposition::kNotRun; + } + } + + return status; +} + 
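+// A minimal worked example of the rotating-workspace sizing used in initialize_workspace()
+// above. The numbers below are hypothetical and chosen only for illustration; they are not
+// taken from any particular GPU or problem in this change.
+//
+//   int64_t l2_bytes      = 40ll * 1024 * 1024;    // hypothetical 40 MiB last-level cache
+//   int64_t problem_bytes = 16ll * 1024 * 1024;    // hypothetical bytes touched per problem
+//   int problem_count = (problem_bytes < 3 * l2_bytes)
+//       ? 1 + int((3 * l2_bytes) / problem_bytes)  // 1 + 7 = 8 rotating copies
+//       : 1;                                       // footprint already exceeds 3x L2; one copy suffices
+//
+// profile_cutlass_() then selects a different copy each iteration via
+// (iteration % conv_workspace_.problem_count), so repeated timing runs do not benefit from
+// data left in the last-level cache by the previous iteration.
+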
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Verifies CUTLASS against references +bool Conv3dOperationProfiler::verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + if (!options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + return true; + } + + if (options.execution_mode == ExecutionMode::kDryRun) { + return true; + } + + cudaError_t result; + + // Initialize structure containing Conv arguments + set_cutlass_operator_arguments_(); + + conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data()); + + // + // Run the CUTLASS operation + // + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + results_.back().disposition = Disposition::kFailed; + return false; + } + } + +#if 0 + std::cout << "profiling : " << std::endl + << "conv2d : " << operation->description().name << std::endl + << "underlying conv2d : " << underlying_operation->description().name << std::endl + << "reduction : " << reduction_op_->description().name << std::endl; +#endif + + // run cutlass conv2d operation + results_.back().status = underlying_operation->run( + &conv_workspace_.arguments, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data()); + + if (results_.back().status != Status::kSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + results_.back().status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + + if (results_.back().status != Status::kSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + } + + // Synchronize before running device reference + result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + // CUTLASS op ran the but not yet verified against any verification provider + results_.back().disposition = Disposition::kNotVerified; + + // + // Run verification providers + // + + if (options.verification.enabled) { + +#if CUTLASS_ENABLE_CUDNN + // Run verification cudnn reference + if (options.verification.provider_enabled(library::Provider::kCUDNN)) { + + // Guard against unsupported cases + auto const & conv_desc = static_cast(operation->description()); + + Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration); + + // Initialize reference data to the source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + if (status == Status::kSuccess) { + // call cudnn verification if supported + verify_with_cudnn_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + else if (status == Status::kErrorInvalidProblem) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kInvalidProblem; + } + + else { + // set verification map for cudnn to not supported + 
results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kNotSupported; + } + } +#endif // #if CUTLASS_ENABLE_CUDNN + + // Run verification host reference + if (options.verification.provider_enabled(library::Provider::kReferenceHost)) { + + // Restore reference data back to initial source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + verify_with_host_reference_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + // Update disposition to worst case verification outcome among all + // verification providers which are supported + bool is_any_verification_run_passed = false; + for(auto &m : results_.back().verification_map) { + if(m.second == Disposition::kFailed || m.second == Disposition::kIncorrect) { + results_.back().disposition = m.second; + return true; + } + if(!is_any_verification_run_passed && m.second == Disposition::kPassed) { + is_any_verification_run_passed = true; + } + } + + if(is_any_verification_run_passed) { + results_.back().disposition = Disposition::kPassed; + } + } + + // Return true means continue profiling + return true; +} + + +/// Verifies CUTLASS against host reference +bool Conv3dOperationProfiler::verify_with_host_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + Status status; + + // + // Find host reference operation using conv functional description key + // + library::OperationDescription const &desc = operation->description(); + + auto &conv_desc = static_cast(desc); + + library::ConvFunctionalKey conv_key( + library::Provider::kReferenceHost, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.C.element, + conv_desc.C.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue); + +#if 0 // debug print to check which host refererence instance is selected + std::cout << conv_key << "\n"; +#endif + + auto operators_it = Singleton::get().operation_table.conv3d_operations.find(conv_key); + + if(operators_it == Singleton::get().operation_table.conv3d_operations.end()) { + + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun; + return true; + } + + // conv3d host reference minimum cc is 0 (CPU) and no iterator algorithm + library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone); + auto cc_it = operators_it->second.find(preference_key); + + if(cc_it == operators_it->second.end()) { + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun; + return true; + } + + // host refernce has only one instances in ConvOperationVectorMap + library::Operation const *reference_op = cc_it->second[0]; + + // + // Copy input tensors A, B, and C from device to host buffers + // + conv_workspace_.host_tensor_a.resize(conv_workspace_.A->bytes()); + conv_workspace_.host_tensor_b.resize(conv_workspace_.B->bytes()); + conv_workspace_.host_tensor_c.resize(conv_workspace_.C->bytes()); + conv_workspace_.A->copy_to_host(conv_workspace_.host_tensor_a.data()); + conv_workspace_.B->copy_to_host(conv_workspace_.host_tensor_b.data()); + conv_workspace_.C->copy_to_host(conv_workspace_.host_tensor_c.data()); + + // + // Initialize structure containing Conv3d arguments + // + conv_workspace_.arguments.A = 
conv_workspace_.host_tensor_a.data(); + conv_workspace_.arguments.B = conv_workspace_.host_tensor_b.data(); + conv_workspace_.arguments.C = conv_workspace_.host_tensor_c.data(); + conv_workspace_.arguments.D = conv_workspace_.host_tensor_c.data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // + // Intialize host reference operation + // + std::vector host_workspace_reference_op; + + uint64_t workspace_size = reference_op->get_host_workspace_size(&conv_workspace_.configuration); + host_workspace_reference_op.resize(workspace_size, 0); + + reference_op->initialize( + &conv_workspace_.configuration, + host_workspace_reference_op.data()); + + // + // Run host reference operation + // + status = reference_op->run( + &conv_workspace_.arguments, + host_workspace_reference_op.data()); + + // Handle errors + if (status != Status::kSuccess) { + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotVerified; + return true; + } + + // + // Copy host reference output to device memory for equality check on device + // + conv_workspace_.Reference->copy_from_host(conv_workspace_.arguments.D); + + // + // Verify results + // + results_.back().verification_map[library::Provider::kReferenceHost] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference, + conv_workspace_.Computed->batch_stride() + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + static_cast(operation->description()), + library::Provider::kCUTLASS, + library::Provider::kReferenceHost); + } + + // Return true means continue profiling + return true; +} + + +/// Verifies CUTLASS against host reference +bool Conv3dOperationProfiler::verify_with_device_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + // TODO: verify cutlass conv3d against device reference + + // Return true means continue profiling + return true; +} + +/// Measures performance results +bool Conv3dOperationProfiler::profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + + if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + + set_cutlass_operator_arguments_(); + + results_.back().status = profile_cutlass_( + results_.back().runtime, + options, + operation, + &conv_workspace_.arguments, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data() + ); + } + return true; + +} + +/// Updates the arguments structure for the CUTLASS operator based on +/// the problem index. 
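+///
+/// For the parallel split-K mode, the convolution itself writes partial accumulations to the
+/// device workspace with (alpha, beta) = (1, 0); the trailing reduction then reads that
+/// workspace together with the original tensor C and applies the user-supplied (alpha, beta).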
+void Conv3dOperationProfiler::set_cutlass_operator_arguments_(int problem_idx) { + // Initialize structure containing Conv3d arguments + conv_workspace_.arguments.A = conv_workspace_.A->batch_data(problem_idx); + conv_workspace_.arguments.B = conv_workspace_.B->batch_data(problem_idx); + conv_workspace_.arguments.C = conv_workspace_.C->batch_data(problem_idx); + conv_workspace_.arguments.D = conv_workspace_.Computed->batch_data(problem_idx); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + conv_workspace_.arguments.D = conv_workspace_.device_workspace.data(); + conv_workspace_.arguments.alpha = problem_.alpha_one.data(); + conv_workspace_.arguments.beta = problem_.beta_zero.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); + conv_workspace_.reduction_arguments.alpha = problem_.alpha.data(); + conv_workspace_.reduction_arguments.beta = problem_.beta.data(); + conv_workspace_.reduction_arguments.pointer_mode = library::ScalarPointerMode::kHost; + } +} + +/// Method to profile a CUTLASS Operation +Status Conv3dOperationProfiler::profile_cutlass_( + double &runtime, + Options const &options, + library::Operation const *operation, + void *arguments, + void *host_workspace, + void *device_workspace) { + + GpuTimer timer; + + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + return Status::kErrorNotSupported; + } + } + + // + // Optional sleep to limit power consumption and thermals + // + + sleep(options.profiling.sleep_duration); + + // + // Warmup loop + // + + Status status; + + for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) { + + // Setup rotating workspace + int workspace_idx = options.profiling.warmup_iterations + iteration; + int problem_idx = (workspace_idx % conv_workspace_.problem_count); + + set_cutlass_operator_arguments_(problem_idx); + + // Run underlying conv2d operation + status = underlying_operation->run( + arguments, + host_workspace, + device_workspace); + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + } + + if (status != Status::kSuccess) { + return status; + } + } + + // + // Initialize GPU timer + // + + timer.start(); + + // + // Profiling loop + // + + int Iterations = options.profiling.iterations; + + int iteration = 0; + for (; iteration < Iterations; ++iteration) { + + // Setup rotating workspace + int problem_idx = (iteration % conv_workspace_.problem_count); + + set_cutlass_operator_arguments_(problem_idx); + + // Run underlying conv2d operation + status = 
underlying_operation->run( + arguments, + host_workspace, + device_workspace); + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + } + + if (status != Status::kSuccess) { + return status; + } + } + + // + // Wait for completion + // + + timer.stop_and_wait(); + + // + // Update performance result + // + + runtime = timer.duration(iteration); + + return status; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +#if CUTLASS_ENABLE_CUDNN + +/// Verifies CUTLASS against cudnn reference +bool Conv3dOperationProfiler::verify_with_cudnn_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + auto &conv_desc = static_cast(operation->description()); + + // + // Construct cudnn operators + // + + CudnnCreate handle; + cudnnStatus_t status = handle.get_cudnn_create_status(); + + if (status != CUDNN_STATUS_SUCCESS) { + + results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status); + return true; + } + + // + // Initialize state + // + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.D = conv_workspace_.Reference->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // cuDNN does not support four tensor arguments, so we copy the tensor C data into + // tensor D. + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + conv_workspace_.arguments.C = conv_workspace_.arguments.D; + + try { + + // + // Construct dispatcher to cudnn operator + // + + detail::cudnnConvDispatcher conv_op( + conv_desc, + conv_workspace_.configuration, + conv_workspace_.arguments, + handle + ); + + if (conv_op.status != Status::kSuccess) { + if (conv_op.status == Status::kErrorNotSupported) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kNotSupported; + + } else { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kFailed; + } + return true; + } + + + status = conv_op(handle); + + // Handle errors + if (status != CUDNN_STATUS_SUCCESS) { + + results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status); + return true; + } + + // + // Verify results + // + + results_.back().verification_map[library::Provider::kCUDNN] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + conv_desc, + library::Provider::kCUTLASS, + library::Provider::kCUDNN); + } + } + catch (...) 
{
+    results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kFailed;
+  }
+
+  // Return true means continue profiling
+  return true;
+
+}
+
+#endif // #if CUTLASS_ENABLE_CUDNN
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/conv3d_operation_profiler.h b/tools/profiler/src/conv3d_operation_profiler.h
new file mode 100644
index 0000000000..04c2a15e82
--- /dev/null
+++ b/tools/profiler/src/conv3d_operation_profiler.h
@@ -0,0 +1,441 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +/* \file + \brief Defines profiling functionality for convolution + +*/ + +#pragma once + +#include +#include +#include +#include +#include + +// CUTLASS Library includes +#include "cutlass/library/library.h" +#include "cutlass/library/util.h" +#include "cutlass/library/handle.h" +#include "cutlass/library/manifest.h" +#include "cutlass/library/singleton.h" + +// Profiler includes +#include "options.h" +#include "device_context.h" +#include "operation_profiler.h" +#include "performance_result.h" +#include "problem_space.h" +#include "reduction_operation_profiler.h" +#if CUTLASS_ENABLE_CUDNN +#include "cudnn_helpers.h" +#endif //#if CUTLASS_ENABLE_CUDNN +#include "debug.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Abstract base class for each math function +class Conv3dOperationProfiler : public OperationProfiler { +public: + + /// Problem structure obtained from problem space + struct Conv3dProblem { + + int64_t n, d, h, w, c, z, p, q, k, t, r, s; + int64_t pad_d, pad_h, pad_w; + int64_t stride_d, stride_h, stride_w; + int64_t dilation_d, dilation_h, dilation_w; + + std::vector alpha; + std::vector beta; + + library::SplitKMode split_k_mode; + int64_t split_k_slices; + + library::ConvModeID conv_mode; + + library::Provider eq_gemm_provider; + + // convolution with parallel interleaved reduction + // convolution epilogue (alpha, beta) = (1.0, 0.0) + // reduction epilogue (alpha, beta) = (Conv3dProblem::alpha, Conv3dProblem::beta) + std::vector alpha_one; + std::vector beta_zero; + + // + // Methods + // + + /// Total number of bytes loaded + int64_t bytes(library::ConvDescription const &operation_desc) const; + + /// Total number of flops computed + int64_t flops(library::ConvDescription const &operation_desc) const; + + /// Infers output size from theinput size, padding, stride, and dilation + void set_default_output_size() { + z = ((d + pad_d - t * dilation_d) / stride_d) + 1; + p = ((h + pad_h - r * dilation_h) / stride_h) + 1; + q = ((w + pad_w - s * dilation_w) / stride_w) + 1; + } + + // Returns equivalent gemm problem size for convolution + cutlass::gemm::GemmCoord eq_gemm_size(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return cutlass::gemm::GemmCoord(int(n * z * p * q), int(k), int(t * r * s * c)); + case library::ConvKind::kDgrad: return cutlass::gemm::GemmCoord(int(n * d * h * w), int(c), int(t * r * s * k)); + case library::ConvKind::kWgrad: return cutlass::gemm::GemmCoord(int(k), int(t * r * s * c), int(n * z * p * q)); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor A + std::vector extent_a(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(n), int(d), int(h), int(w), int(c)}; + case library::ConvKind::kDgrad: return {int(n), int(z), int(p), int(q), int(k)}; + case library::ConvKind::kWgrad: return {int(n), int(z), int(p), int(q), int(k)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor B + std::vector extent_b(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case 
library::ConvKind::kFprop: return {int(k), int(t), int(r), int(s), int(c)}; + case library::ConvKind::kDgrad: return {int(k), int(t), int(r), int(s), int(c)}; + case library::ConvKind::kWgrad: return {int(n), int(d), int(h), int(w), int(c)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor C + std::vector extent_c(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(n), int(z), int(p), int(q), int(k)}; + case library::ConvKind::kDgrad: return {int(n), int(d), int(h), int(w), int(c)}; + case library::ConvKind::kWgrad: return {int(k), int(t), int(r), int(s), int(c)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix A + library::LayoutTypeID eq_gemm_layout_a(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return library::LayoutTypeID::kRowMajor; // TN Gemm + case library::ConvKind::kDgrad: return library::LayoutTypeID::kRowMajor; // TT Gemm + case library::ConvKind::kWgrad: return library::LayoutTypeID::kColumnMajor; // NT Gemm + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix B + library::LayoutTypeID eq_gemm_layout_b(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return library::LayoutTypeID::kColumnMajor; // TN Gemm + case library::ConvKind::kDgrad: return library::LayoutTypeID::kRowMajor; // TT Gemm + case library::ConvKind::kWgrad: return library::LayoutTypeID::kRowMajor; // NT Gemm + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix C + library::LayoutTypeID eq_gemm_layout_c(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + // Gemm operator assumes column-major output + case library::ConvKind::kFprop: + case library::ConvKind::kDgrad: + case library::ConvKind::kWgrad: return library::LayoutTypeID::kColumnMajor; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix A + int64_t eq_gemm_lda(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kDgrad: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).m(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix B + int64_t eq_gemm_ldb(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kDgrad: return eq_gemm_size(conv_kind).n(); + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).n(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix C + int64_t eq_gemm_ldc(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: + case library::ConvKind::kDgrad: + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).m(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, 
dgrad, wgrad)"); + } + } + }; + + /// Workspace used + struct Conv2dWorkspace { + + /// Conv device allocations + DeviceAllocation *A; + DeviceAllocation *B; + DeviceAllocation *C; + DeviceAllocation *Computed; + DeviceAllocation *Reference; + + /// Library configuration and arguments for convolution operator + library::Conv3dConfiguration configuration; + library::ConvArguments arguments; + + /// Number of copies of the problem workspace which are visited sequentially during + /// profiling to avoid camping in the last level cache. + int problem_count; + + /// Buffer used for the cutlass conv2d operations' host workspace + std::vector host_workspace; + + /// Buffer used for the cutlass operations' device workspace + DeviceAllocation device_workspace; + + /// Library configuration and arguments for reduction operator + library::ReductionConfiguration reduction_configuration; + library::ReductionArguments reduction_arguments; + + /// Buffer used for the cutlass reduction operations' host workspace + std::vector reduction_host_workspace; + + /// Host data buffers for host reference operation + /// host buffer for tensor + std::vector host_tensor_a; + + /// host buffer for tensor b + std::vector host_tensor_b; + + /// host buffer for tensor c + std::vector host_tensor_c; + + + // + // Methods + // + + Conv2dWorkspace(): + A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { } + + // Returns stride vector for tensor A + std::vector stride_a(library::ConvKind const &conv_kind) { + return { + configuration.layout_a(conv_kind).stride()[0], + configuration.layout_a(conv_kind).stride()[1], + configuration.layout_a(conv_kind).stride()[2], + configuration.layout_a(conv_kind).stride()[3] + }; + } + + // Returns stride vector for tensor B + std::vector stride_b(library::ConvKind const &conv_kind) { + + return { + configuration.layout_b(conv_kind).stride()[0], + configuration.layout_b(conv_kind).stride()[1], + configuration.layout_b(conv_kind).stride()[2], + configuration.layout_b(conv_kind).stride()[3] + }; + } + + // Returns stride vector for tensor C + std::vector stride_c(library::ConvKind const &conv_kind) { + + return { + configuration.layout_c(conv_kind).stride()[0], + configuration.layout_c(conv_kind).stride()[1], + configuration.layout_c(conv_kind).stride()[2], + configuration.layout_c(conv_kind).stride()[3] + }; + } + }; + +protected: + + // + // Data members + // + + /// CONV problem obtained from problem space + Conv3dProblem problem_; + + /// Device memory allocations + Conv2dWorkspace conv_workspace_; + + /// CUTLASS parallel reduction operation to follow this* conv2d operation + library::Operation const *reduction_op_; + +public: + // + // Methods + // + + /// Ctor + Conv3dOperationProfiler(Options const &options); + + /// Destructor + virtual ~Conv3dOperationProfiler(); + + /// Prints usage statement for the math function + virtual void print_usage(std::ostream &out) const; + + /// Prints examples + virtual void print_examples(std::ostream &out) const; + + /// Extracts the problem dimensions + virtual Status initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes workspace + virtual Status initialize_workspace( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + 
ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against references + virtual bool verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Measures performance results + virtual bool profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +protected: + + /// Updates the arguments structure for the CUTLASS operator based on + /// the problem index. + void set_cutlass_operator_arguments_(int problem_idx = 0); + + /// Method to profile an initialized CUTLASS operation + virtual Status profile_cutlass_( + double &runtime, + Options const &options, + library::Operation const *operation, + void *arguments, + void *host_workspace, + void *device_workspace); + + /// Initialize reduction problem dimenstions and library::Operation + bool initialize_reduction_configuration_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes the performance result + void initialize_result_( + PerformanceResult &result, + Options const &options, + library::ConvDescription const &operation_desc, + ProblemSpace const &problem_space); + + /// Verifies CUTLASS against host reference + bool verify_with_host_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against device reference + bool verify_with_device_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +#if CUTLASS_ENABLE_CUDNN + + /// Verifies CUTLASS against cudnn reference + bool verify_with_cudnn_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +#endif //#if CUTLASS_ENABLE_CUDNN + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/profiler/src/cudnn_helpers.cpp b/tools/profiler/src/cudnn_helpers.cpp new file mode 100644 index 0000000000..86f18095bf --- /dev/null +++ b/tools/profiler/src/cudnn_helpers.cpp @@ -0,0 +1,485 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Helper functions for mapping CUTLASS concepts to cuDNN. +*/ +#if CUTLASS_ENABLE_CUDNN + +#include + +#include "cudnn_helpers.h" + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Converts a cuDNN status to cutlass::Status +Status get_cutlass_status(cudnnStatus_t cudnn_status) { + + if (cudnn_status == CUDNN_STATUS_SUCCESS) { + return Status::kSuccess; + } + else if (cudnn_status == CUDNN_STATUS_INVALID_VALUE) { + return Status::kErrorInvalidProblem; + } + if (cudnn_status == CUDNN_STATUS_NOT_SUPPORTED) { + return Status::kErrorNotSupported; + } + return Status::kErrorInternal; +} + +/// Converts a cuDNN status to cutlass::profiler::Disposition +Disposition get_cutlass_disposition(cudnnStatus_t cudnn_status) { + + if (cudnn_status == CUDNN_STATUS_INVALID_VALUE) { + return Disposition::kInvalidProblem; + } + else if (cudnn_status == CUDNN_STATUS_NOT_SUPPORTED) { + return Disposition::kNotSupported; + } + return Disposition::kFailed; +} + +/// Checks cudnnStatus_t converts to cutlas status and returns if Status::kSuccess o.w. 
throws exception +Status checkCudnnErr(cudnnStatus_t cudnn_status) { + Status cutlass_status = get_cutlass_status(cudnn_status); + if(cutlass_status != Status::kSuccess) { + throw std::runtime_error("checkCudnnErr failed"); + } + return cutlass_status; +} + +/// Maps a CUTLASS conv mode to a cuDNN cudnnConvolutionMode_t +bool get_cudnn_conv_mode(cudnnConvolutionMode_t &cudnn_conv_mode, conv::Mode conv_mode) { + switch (conv_mode) { + case conv::Mode::kCrossCorrelation: + cudnn_conv_mode = CUDNN_CROSS_CORRELATION; + return true; + case conv::Mode::kConvolution: + cudnn_conv_mode = CUDNN_CONVOLUTION; + return true; + default: break; + } + return false; +} + +/// Maps a CUTLASS tensor layout to a cuDNN cudnnTensorFormat_t +bool get_cudnn_layout(cudnnTensorFormat_t &cudnn_layout, library::LayoutTypeID layout) { + switch (layout) { + // cudnn uses the same enum for TensorNC*HW along nDim (ConvDescription::conv_dim) + case library::LayoutTypeID::kTensorNCHW: + case library::LayoutTypeID::kTensorNCDHW: + cudnn_layout = CUDNN_TENSOR_NCHW; + return true; + case library::LayoutTypeID::kTensorNHWC: + case library::LayoutTypeID::kTensorNDHWC: + cudnn_layout = CUDNN_TENSOR_NHWC; + return true; + default: break; + } + return false; +} + +/// Maps a CUTLASS numeric type to a cuDNN cudnnDataType_t +bool get_cudnn_datatype(cudnnDataType_t &cudnn_element_type, library::NumericTypeID element_type) { + switch (element_type) { + case library::NumericTypeID::kF16: + cudnn_element_type = CUDNN_DATA_HALF; + return true; + + case library::NumericTypeID::kF32: + cudnn_element_type = CUDNN_DATA_FLOAT; + return true; + + case library::NumericTypeID::kF64: + cudnn_element_type = CUDNN_DATA_DOUBLE; + return true; + + case library::NumericTypeID::kS2: + break; + + case library::NumericTypeID::kS4: + break; + + case library::NumericTypeID::kS8: + cudnn_element_type = CUDNN_DATA_INT8; + return true; + + case library::NumericTypeID::kS16: + break; + + case library::NumericTypeID::kS32: + cudnn_element_type = CUDNN_DATA_INT32; + return true; + + case library::NumericTypeID::kS64: + break; + + case library::NumericTypeID::kU2: + break; + + case library::NumericTypeID::kU4: + break; + + case library::NumericTypeID::kU8: + cudnn_element_type = CUDNN_DATA_UINT8; + return true; + + case library::NumericTypeID::kU16: + break; + + case library::NumericTypeID::kU32: + break; + + case library::NumericTypeID::kU64: + break; + + case library::NumericTypeID::kB1: + break; + + case library::NumericTypeID::kInvalid: + + default: + break; + } + + return false; +} + +/// Maps CUTLASS math OpcodeClassID and MathOperationID to cuDNN math_type +bool get_cudnn_mathtype(cudnnMathType_t &cudnn_math_type, library::ConvDescription const &conv_desc) { + + switch (conv_desc.tile_description.math_instruction.opcode_class) { + + case library::OpcodeClassID::kTensorOp: + { + cudnn_math_type = CUDNN_TENSOR_OP_MATH; + + library::MathOperationID math_op = conv_desc.tile_description.math_instruction.math_operation; + + // Allow conversion on input data type for fast math operations + if (math_op == library::MathOperationID::kMultiplyAddFastF16 || + math_op == library::MathOperationID::kMultiplyAddFastBF16) + { + cudnn_math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; + } + + return true; + } + case library::OpcodeClassID::kSimt: + return false; + } + + return false; +} + +/// Cudnn compute type seems to be hardcoded to float (To handle a possible cudnn issue) +float cast_cudnn_compute_type_to_float(library::NumericTypeID type, void const * src) { + + 
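+  // Reads the scalar stored at 'src' as the element type 'type' and converts it to float;
+  // unhandled element types throw. This mirrors how detail::cudnnConvDispatcher consumes the
+  // epilogue scalars below:
+  //
+  //   alpha = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.alpha);
+  //   beta  = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.beta);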
switch (type) { + case library::NumericTypeID::kF16: + { + return float(*(static_cast(src))); + } + case library::NumericTypeID::kF32: + { + return float(*(static_cast(src))); + } + case library::NumericTypeID::kS32: + { + return float(*(static_cast(src))); + } + default: + throw std::runtime_error("Data type handled in cast_compute_type_to_float"); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Returns a status if cuDNN can satisfy a particular Conv2d description +Status cudnn_satisfies( + library::ConvDescription const &desc, + library::Conv2dConfiguration const &configuration) { + + auto const &a_tensor = desc.A; + auto const &b_tensor = desc.B; + auto const &c_tensor = desc.C; + auto const &math_instruction = desc.tile_description.math_instruction; + + if(a_tensor.element != b_tensor.element) { + return Status::kErrorInvalidDataType; + } + + //////////////////////// Convolution output dimensions p and q /////////////////////// + // Cutlass convolutions support arbitrary output dimensions and not constriant by // + // input, filter, padding, striding, dilation sizes. // + // cuDNN sets the output dimensions (p, q) using following equations: // + // // + // output = div_up(input + 2 * pad - ((filter - 1) * dilation + 1) + 1, stride) // + // where; div_up(a, b) : (a - 1)/b + 1 // + // // + // Before launching cudnn verification or profiling check that output p and q // + // dimensions are cuDNN compliant. // + // // + // If user sets output p and q which do not follow above constraints, cutlass conv, // + // host reference, device reference can run. However, cudnn convolution returns // + // "Invalid problem" // + // // + /////////////////////////////////////////////////////////////////////////////////////// + + // check conv output dimension p for cudnn + int cudnn_output_p = + ( + ( + configuration.problem_size.H + + 2 * configuration.problem_size.pad_h - + ((configuration.problem_size.R - 1) * + configuration.problem_size.dilation_h + 1) + ) / + (configuration.problem_size.stride_h) + + 1 + ); + + if (cudnn_output_p != configuration.problem_size.P) { + return Status::kErrorInvalidProblem; + } + + // check conv output dimension q for cudnn + int cudnn_output_q = + ( + ( + configuration.problem_size.W + + 2 * configuration.problem_size.pad_w - + ((configuration.problem_size.S - 1) * + configuration.problem_size.dilation_w + 1) + ) / + (configuration.problem_size.stride_w) + + 1 + ); + + if (cudnn_output_q != configuration.problem_size.Q) { + return Status::kErrorInvalidProblem; + } + ////////////////////////////////////////////////////////////////////////////////////// + + // conv operator with input=FP16, accumulator=FP32, output=FP32 datatype + if (a_tensor.element == library::NumericTypeID::kF16 && + b_tensor.element == library::NumericTypeID::kF16 && + math_instruction.element_accumulator == library::NumericTypeID::kF32 && + c_tensor.element == library::NumericTypeID::kF32 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kBF16 || + b_tensor.element == library::NumericTypeID::kBF16 || + c_tensor.element == library::NumericTypeID::kBF16 + ) { + + return Status::kErrorNotSupported; + } + + // TF32 input not supported in cuDNN + if (a_tensor.element == library::NumericTypeID::kTF32 || + b_tensor.element == library::NumericTypeID::kTF32 || + c_tensor.element == library::NumericTypeID::kTF32 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == 
library::NumericTypeID::kS8 || + b_tensor.element == library::NumericTypeID::kS8 || + c_tensor.element == library::NumericTypeID::kS8 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kU8 || + b_tensor.element == library::NumericTypeID::kU8 || + c_tensor.element == library::NumericTypeID::kU8 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kS4 || + b_tensor.element == library::NumericTypeID::kS4 || + c_tensor.element == library::NumericTypeID::kS4 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kU4 || + b_tensor.element == library::NumericTypeID::kU4 || + c_tensor.element == library::NumericTypeID::kU4 + ) { + + return Status::kErrorNotSupported; + } + + return Status::kSuccess; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns a status if cuDNN can satisfy a particular Conv3d description +Status cudnn_satisfies( + library::ConvDescription const &desc, + library::Conv3dConfiguration const &configuration) { + + auto const &a_tensor = desc.A; + auto const &b_tensor = desc.B; + auto const &c_tensor = desc.C; + auto const &math_instruction = desc.tile_description.math_instruction; + + if(a_tensor.element != b_tensor.element) { + return Status::kErrorInvalidDataType; + } + + //////////////////////// Convolution output dimensions p and q /////////////////////// + // Cutlass convolutions support arbitrary output dimensions and not constriant by // + // input, filter, padding, striding, dilation sizes. // + // cuDNN sets the output dimensions (p, q) using following equations: // + // // + // output = div_up(input + 2 * pad - ((filter - 1) * dilation + 1) + 1, stride) // + // where; div_up(a, b) : (a - 1)/b + 1 // + // // + // Before launching cudnn verification or profiling check that output p and q // + // dimensions are cuDNN compliant. // + // // + // If user sets output p and q which do not follow above constraints, cutlass conv, // + // host reference, device reference can run. 
However, cudnn convolution returns // + // "Invalid problem" // + // // + /////////////////////////////////////////////////////////////////////////////////////// + + // check conv output dimension z for cudnn + int cudnn_output_z = + ( + ( + configuration.problem_size.D + + 2 * configuration.problem_size.pad_d - + ((configuration.problem_size.T - 1) * + configuration.problem_size.dilation_d + 1) + ) / + (configuration.problem_size.stride_d) + + 1 + ); + + if (cudnn_output_z != configuration.problem_size.Z) { + return Status::kErrorInvalidProblem; + } + + // check conv output dimension p for cudnn + int cudnn_output_p = + ( + ( + configuration.problem_size.H + + 2 * configuration.problem_size.pad_h - + ((configuration.problem_size.R - 1) * + configuration.problem_size.dilation_h + 1) + ) / + (configuration.problem_size.stride_h) + + 1 + ); + + if (cudnn_output_p != configuration.problem_size.P) { + return Status::kErrorInvalidProblem; + } + + // check conv output dimension q for cudnn + int cudnn_output_q = + ( + ( + configuration.problem_size.W + + 2 * configuration.problem_size.pad_w - + ((configuration.problem_size.S - 1) * + configuration.problem_size.dilation_w + 1) + ) / + (configuration.problem_size.stride_w) + + 1 + ); + + if (cudnn_output_q != configuration.problem_size.Q) { + return Status::kErrorInvalidProblem; + } + ////////////////////////////////////////////////////////////////////////////////////// + + // conv operator with input, accumulator, output datatype of (hss) are not supported + // in cuDNN + if (a_tensor.element == library::NumericTypeID::kF16 && + b_tensor.element == library::NumericTypeID::kF16 && + math_instruction.element_accumulator == library::NumericTypeID::kF32 && + c_tensor.element == library::NumericTypeID::kF32 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kBF16 || + b_tensor.element == library::NumericTypeID::kBF16 || + c_tensor.element == library::NumericTypeID::kBF16 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kTF32 || + b_tensor.element == library::NumericTypeID::kTF32 || + c_tensor.element == library::NumericTypeID::kTF32 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kS8 || + b_tensor.element == library::NumericTypeID::kS8 || + c_tensor.element == library::NumericTypeID::kS8 + ) { + + return Status::kErrorNotSupported; + } + + // S4 not supported in cuDNN + if (a_tensor.element == library::NumericTypeID::kS4 || + b_tensor.element == library::NumericTypeID::kS4 || + c_tensor.element == library::NumericTypeID::kS4 + ) { + + return Status::kErrorNotSupported; + } + + return Status::kSuccess; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +#endif diff --git a/tools/profiler/src/cudnn_helpers.h b/tools/profiler/src/cudnn_helpers.h new file mode 100644 index 0000000000..58fe4e678f --- /dev/null +++ b/tools/profiler/src/cudnn_helpers.h @@ -0,0 +1,584 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Helper functions for mapping CUTLASS concepts to cuDNN. + +*/ + +#pragma once +#if CUTLASS_ENABLE_CUDNN +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/library/library.h" +#include "enumerated_types.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Converts a cuDNN status to cutlass::Status +Status get_cutlass_status(cudnnStatus_t cudnn_status); + +/// Converts a cuDNN status to cutlass::profiler::Disposition +Disposition get_cutlass_disposition(cudnnStatus_t cudnn_status); + +/// Checks cudnnStatus_t converts to cutlas status and returns if Status::kSuccess o.w. 
throws exception +Status checkCudnnErr(cudnnStatus_t cudnn_status); + +/// Maps a CUTLASS conv mode to a cuDNN conv mode enumeration +bool get_cudnn_conv_mode(cudnnConvolutionMode_t &cudnn_conv_mode, conv::Mode conv_mode); + +/// Maps a CUTLASS layout type to a cuDNN data type enumeration +bool get_cudnn_layout(cudnnTensorFormat_t &cudnn_layout, library::LayoutTypeID layout); + +/// Maps a CUTLASS numeric type to a cuDNN data type enumeration +bool get_cudnn_datatype(cudnnDataType_t &cudnn_element_type, library::NumericTypeID element_type); + +/// Maps CUTLASS math OpcodeClassID and MathOperationID to cuDNN math_type +bool get_cudnn_mathtype(cudnnMathType_t &cudnn_math_type, library::ConvDescription const &conv_desc); + +/// Returns a status if cudnn can satisfy a particular Conv2d description +Status cudnn_satisfies(library::ConvDescription const &desc, library::Conv2dConfiguration const &configuration); + +/// Returns a status if cudnn can satisfy a particular Conv3d description +Status cudnn_satisfies(library::ConvDescription const &desc, library::Conv3dConfiguration const &configuration); + +/// Cudnn compute type seems to be hardcoded to float (To handle a possible cudnn issue) +float cast_cudnn_compute_type_to_float(library::NumericTypeID type, void const * src); + + +/// This is a helper class to create cudnnHandle_t automatically on CudnnCreate object creation and +/// to destroy cudnnHandle_t on CudnnCreate object destruction. +/// Additionaly, it provides implicit cast from CudnnCreate's object to cudnnHandle_t's object +class CudnnCreate { +private: + cudnnHandle_t handle; + cudnnStatus_t status; + +public: + CudnnCreate() { + status = cudnnCreate(&handle); + } + + ~CudnnCreate() { + cudnnDestroy(handle); + } + + /// Implicit cast CudnnCreate object to cudnnHandle_t + operator cudnnHandle_t() const { return handle; } + + /// returns cudnnStatus_t for handle creation + cudnnStatus_t get_cudnn_create_status() { return status; } +}; + + +namespace detail { + +/// Dispatcher to cudnn convolution operators +struct cudnnConvDispatcher { + + // + // Data members + // + //library::Conv2dConfiguration configuration; + library::ConvArguments arguments; + library::ConvKind conv_kind; + + // cudnn-specific data structures to fill cudnn API call arguments + // cudnn activation, filter, and output descriptors + cudnnTensorDescriptor_t activation_desc; + cudnnFilterDescriptor_t filter_desc; + cudnnTensorDescriptor_t output_desc; + cudnnConvolutionDescriptor_t conv_desc; + + // cudnn datatypes + cudnnDataType_t data_type_activation; + cudnnDataType_t data_type_filter; + cudnnDataType_t data_type_output; + + // cudnn layouts + cudnnTensorFormat_t layout_activation; + cudnnTensorFormat_t layout_filter; + cudnnTensorFormat_t layout_output; + + // cudnn convolution mode + cudnnConvolutionMode_t conv_mode; + + // cudnn math type (tensorop, tensorop with conversion, simt) + cudnnMathType_t math_type; + + // cudnn compute data type + cudnnDataType_t compute_type; + + // cudnn compute type seems to be hardcoded to float (to handle a possible a cudnn issue) + float alpha; + float beta; + + // cudnn workspace + size_t workspace_size_in_bytes = 0; + cutlass::device_memory::allocation workspace; + + // select cudnn's implicit gemm precomputed algorithm with tensor operations + static cudnnConvolutionFwdAlgo_t const fprop_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + static cudnnConvolutionBwdDataAlgo_t const dgrad_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + static 
cudnnConvolutionBwdFilterAlgo_t const wgrad_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + + Status status; + + // + // Methods + // + + // TODO: unify ctor cudnnConvDispatcher for conv2d and conv3d by unifying Conv2dConfigration + + // ctor for conv2d + cudnnConvDispatcher( + library::ConvDescription const &op_desc, + library::Conv2dConfiguration configuration, + library::ConvArguments arguments_, + cudnnHandle_t handle + ): + //configuration(configuration_), + arguments(arguments_), + conv_kind(op_desc.conv_kind), + status(Status::kSuccess) { + + bool good = true; + + // Get cudnn datatype, layout, and convolution mode from library::ConvDescription + good = (good && get_cudnn_datatype(data_type_activation, op_desc.A.element)); + good = (good && get_cudnn_datatype(data_type_filter, op_desc.B.element)); + good = (good && get_cudnn_datatype(data_type_output, op_desc.C.element)); + good = (good && get_cudnn_layout(layout_activation, op_desc.A.layout)); + good = (good && get_cudnn_layout(layout_filter, op_desc.B.layout)); + good = (good && get_cudnn_layout(layout_output, op_desc.C.layout)); + good = (good && get_cudnn_conv_mode(conv_mode, configuration.problem_size.mode)); + // Get cudnn mathtype (cudnnMathType_t) + good = (good && get_cudnn_mathtype(math_type, op_desc)); + good = (good && get_cudnn_datatype( + compute_type, + op_desc.tile_description.math_instruction.element_accumulator)); + // Check cutlass Conv2d description has equivalent operator in cudnn + if (!good) { + status = Status::kErrorNotSupported; + return; + } + // cudnn compute type seems to be hardcoded to float (to handle a possible a cudnn issue) + alpha = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.alpha); + beta = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.beta); + + // Create convolution descriptor object + status = get_cutlass_status(cudnnCreateConvolutionDescriptor(&conv_desc)); + + // Configure convolution operator + std::vector padding {configuration.problem_size.pad_h, configuration.problem_size.pad_w}; + std::vector stride {configuration.problem_size.stride_h, configuration.problem_size.stride_w}; + std::vector dilation {configuration.problem_size.dilation_h, configuration.problem_size.dilation_w}; + + status = get_cutlass_status( + cudnnSetConvolutionNdDescriptor( + conv_desc, + op_desc.conv_dim, + padding.data(), + stride.data(), + dilation.data(), + conv_mode, + compute_type + )); + + // Set groups + status = get_cutlass_status(cudnnSetConvolutionGroupCount(conv_desc, configuration.problem_size.groups)); + + // Create activation, filter, and output descriptor objects + status = get_cutlass_status(cudnnCreateTensorDescriptor(&activation_desc)); + status = get_cutlass_status(cudnnCreateFilterDescriptor(&filter_desc)); + status = get_cutlass_status(cudnnCreateTensorDescriptor(&output_desc)); + + // Set activation, filter, and output descriptor + status = get_cutlass_status( + cudnnSetTensor4dDescriptor( + activation_desc, + layout_activation, + data_type_activation, + configuration.problem_size.N, + configuration.problem_size.C, + configuration.problem_size.H, + configuration.problem_size.W + )); + + status = get_cutlass_status( + cudnnSetFilter4dDescriptor( + filter_desc, + data_type_filter, + layout_filter, + configuration.problem_size.K, + configuration.problem_size.C, + configuration.problem_size.R, + configuration.problem_size.S + )); + + status = get_cutlass_status( + cudnnSetTensor4dDescriptor( + output_desc, + layout_output, + data_type_output, + 
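+        // output tensor extents in the API's (n, c, h, w) slots: N, K, P, Q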
configuration.problem_size.N, + configuration.problem_size.K, + configuration.problem_size.P, + configuration.problem_size.Q + )); + + // Set math instruction to tensor op + status = get_cutlass_status( + cudnnSetConvolutionMathType(conv_desc, math_type)); + + // Initialize workspace + switch (conv_kind) { + case library::ConvKind::kFprop: + status = get_cutlass_status( + cudnnGetConvolutionForwardWorkspaceSize( + handle, + activation_desc, + filter_desc, + conv_desc, + output_desc, + fprop_algo, + &workspace_size_in_bytes + )); break; + case library::ConvKind::kDgrad: + status = get_cutlass_status( + cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, + filter_desc, + output_desc, + conv_desc, + activation_desc, + dgrad_algo, + &workspace_size_in_bytes + )); break; + case library::ConvKind::kWgrad: + status = get_cutlass_status( + cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, + activation_desc, + output_desc, + conv_desc, + filter_desc, + wgrad_algo, + &workspace_size_in_bytes + )); break; + + } + + workspace = cutlass::device_memory::allocation(workspace_size_in_bytes); + } + + + // ctor for conv3d + cudnnConvDispatcher( + library::ConvDescription const &op_desc, + library::Conv3dConfiguration configuration, + library::ConvArguments arguments_, + cudnnHandle_t handle + ): + //configuration(configuration_), + arguments(arguments_), + conv_kind(op_desc.conv_kind), + status(Status::kSuccess) { + + bool good = true; + + // Get cudnn datatype, layout, and convolution mode from library::ConvDescription + good = (good && get_cudnn_datatype(data_type_activation, op_desc.A.element)); + good = (good && get_cudnn_datatype(data_type_filter, op_desc.B.element)); + good = (good && get_cudnn_datatype(data_type_output, op_desc.C.element)); + + good = (good && get_cudnn_layout(layout_activation, op_desc.A.layout)); + good = (good && get_cudnn_layout(layout_filter, op_desc.B.layout)); + good = (good && get_cudnn_layout(layout_output, op_desc.C.layout)); + + good = (good && get_cudnn_conv_mode(conv_mode, configuration.problem_size.mode)); + + // cudnn compute type seems to be hardcoded to float (to handle a possible a cudnn issue) + alpha = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.alpha); + beta = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.beta); + + good = (good && get_cudnn_datatype( + compute_type, + op_desc.tile_description.math_instruction.element_accumulator)); + + // Check cutlass Conv2d description has equivalent operator in cudnn + if (!good) { + status = Status::kErrorNotSupported; + } + + // Create convolution descriptor object + status = get_cutlass_status(cudnnCreateConvolutionDescriptor(&conv_desc)); + + // Configure convolution operator + std::vector padding {configuration.problem_size.pad_d, configuration.problem_size.pad_h, configuration.problem_size.pad_w}; + std::vector stride {configuration.problem_size.stride_d, configuration.problem_size.stride_h, configuration.problem_size.stride_w}; + std::vector dilation {configuration.problem_size.dilation_d, configuration.problem_size.dilation_h, configuration.problem_size.dilation_w}; + + status = get_cutlass_status( + cudnnSetConvolutionNdDescriptor( + conv_desc, + op_desc.conv_dim, + padding.data(), + stride.data(), + dilation.data(), + conv_mode, + compute_type + )); + + // Set groups + status = get_cutlass_status(cudnnSetConvolutionGroupCount(conv_desc, configuration.problem_size.groups)); + + // Create activation, filter, and output descriptor objects + status = 
get_cutlass_status(cudnnCreateTensorDescriptor(&activation_desc)); + status = get_cutlass_status(cudnnCreateFilterDescriptor(&filter_desc)); + status = get_cutlass_status(cudnnCreateTensorDescriptor(&output_desc)); + + // Set activation descriptor + std::vector activation_extent { + configuration.problem_size.N, + configuration.problem_size.C, + configuration.problem_size.D, + configuration.problem_size.H, + configuration.problem_size.W + }; + + std::vector activation_stride { + configuration.layout_activations.stride()[3], + 1, + configuration.layout_activations.stride()[2], + configuration.layout_activations.stride()[1], + configuration.layout_activations.stride()[0] + }; + + status = get_cutlass_status( + cudnnSetTensorNdDescriptor( + activation_desc, + data_type_activation, + op_desc.conv_dim + 2, + activation_extent.data(), + activation_stride.data() + )); + + // Set filter descriptor + std::vector filter_extent { + configuration.problem_size.K, + configuration.problem_size.C, + configuration.problem_size.T, + configuration.problem_size.R, + configuration.problem_size.S + }; + + std::vector filter_stride { + configuration.layout_filters.stride()[3], + 1, + configuration.layout_filters.stride()[2], + configuration.layout_filters.stride()[1], + configuration.layout_filters.stride()[0] + }; + + status = get_cutlass_status( + cudnnSetFilterNdDescriptor( + filter_desc, + data_type_filter, + layout_filter, + op_desc.conv_dim + 2, + filter_extent.data() + )); + + + // Set output descriptor + std::vector output_extent { + configuration.problem_size.N, + configuration.problem_size.K, + configuration.problem_size.Z, + configuration.problem_size.P, + configuration.problem_size.Q + }; + + std::vector output_stride { + configuration.layout_output.stride()[3], + 1, + configuration.layout_output.stride()[2], + configuration.layout_output.stride()[1], + configuration.layout_output.stride()[0] + }; + + status = get_cutlass_status( + cudnnSetTensorNdDescriptor( + output_desc, + data_type_output, + op_desc.conv_dim + 2, + output_extent.data(), + output_stride.data() + )); + + // Set math instruction to tensor op + status = get_cutlass_status( + cudnnSetConvolutionMathType(conv_desc, math_type)); + + // Initialize workspace + switch (conv_kind) { + case library::ConvKind::kFprop: + status = get_cutlass_status( + cudnnGetConvolutionForwardWorkspaceSize( + handle, + activation_desc, + filter_desc, + conv_desc, + output_desc, + fprop_algo, + &workspace_size_in_bytes + )); break; + case library::ConvKind::kDgrad: + status = get_cutlass_status( + cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, + filter_desc, + output_desc, + conv_desc, + activation_desc, + dgrad_algo, + &workspace_size_in_bytes + )); break; + case library::ConvKind::kWgrad: + status = get_cutlass_status( + cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, + activation_desc, + output_desc, + conv_desc, + filter_desc, + wgrad_algo, + &workspace_size_in_bytes + )); break; + + } + + workspace = cutlass::device_memory::allocation(workspace_size_in_bytes); + } + + /// Executes Conv2d operater from cudnn library + cudnnStatus_t operator()(cudnnHandle_t handle) { + + switch (conv_kind) { + case library::ConvKind::kFprop: + return cudnnConvolutionForward( + handle, + &alpha, + activation_desc, + activation(), + filter_desc, + filter(), + conv_desc, + fprop_algo, + workspace.get(), + workspace_size_in_bytes, + &beta, + output_desc, + arguments.D + ); + case library::ConvKind::kDgrad: + return cudnnConvolutionBackwardData( + handle, + 
&alpha, + filter_desc, + filter(), + output_desc, + output(), + conv_desc, + dgrad_algo, + workspace.get(), + workspace_size_in_bytes, + &beta, + activation_desc, + arguments.D + ); + case library::ConvKind::kWgrad: + return cudnnConvolutionBackwardFilter( + handle, + &alpha, + activation_desc, + activation(), + output_desc, + output(), + conv_desc, + wgrad_algo, + workspace.get(), + workspace_size_in_bytes, + &beta, + filter_desc, + arguments.D + ); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Actviation Tensor + void const * activation() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return arguments.A; + case library::ConvKind::kDgrad : return arguments.C; + case library::ConvKind::kWgrad : return arguments.B; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Filter Tensor + void const *filter() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return arguments.B; + case library::ConvKind::kDgrad : return arguments.B; + case library::ConvKind::kWgrad : return arguments.C; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Output Tensor + void const *output() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return arguments.C; + case library::ConvKind::kDgrad : return arguments.A; + case library::ConvKind::kWgrad : return arguments.A; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } +}; + +} // namespace detail +///////////////////////////////////////////////////////////////////////////////////////////////// +#endif //#if CUTLASS_ENABLE_CUDNN +} // namespace profiler +} // namespace cutlass diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu index 9934ff4cd6..c1e33ad61e 100644 --- a/tools/profiler/src/cutlass_profiler.cu +++ b/tools/profiler/src/cutlass_profiler.cu @@ -32,6 +32,8 @@ // Profiler includes #include "cutlass_profiler.h" #include "gemm_operation_profiler.h" +#include "conv2d_operation_profiler.h" +#include "conv3d_operation_profiler.h" #include "sparse_gemm_operation_profiler.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -50,6 +52,10 @@ CutlassProfiler::CutlassProfiler( operation_profilers_.emplace_back(new SparseGemmOperationProfiler(options)); + operation_profilers_.emplace_back(new Conv2dOperationProfiler(options)); + + operation_profilers_.emplace_back(new Conv3dOperationProfiler(options)); + } CutlassProfiler::~CutlassProfiler() { @@ -159,6 +165,8 @@ void CutlassProfiler::print_usage_(std::ostream &out) { out << "\n\nFor details about a particular function, specify the function name with --help.\n\nExample:\n\n" << " $ cutlass_profiler --operation=Gemm --help\n\n" + << " $ cutlass_profiler --operation=Conv3d --help\n\n" + << " $ cutlass_profiler --operation=Conv2d --help\n\n" ; } diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu index 777fb4d0aa..247bcccf15 100644 --- a/tools/profiler/src/device_allocation.cu +++ b/tools/profiler/src/device_allocation.cu @@ -133,7 +133,18 @@ std::vector DeviceAllocation::get_packed_layout( case library::LayoutTypeID::kTensorNDHWC: stride = get_packed_layout_stride(extent); break; - + case library::LayoutTypeID::kTensorNC32HW32: + stride = get_packed_layout_stride>(extent); + break; + case library::LayoutTypeID::kTensorNC64HW64: + 
stride = get_packed_layout_stride>(extent); + break; + case library::LayoutTypeID::kTensorC32RSK32: + stride = get_packed_layout_stride>(extent); + break; + case library::LayoutTypeID::kTensorC64RSK64: + stride = get_packed_layout_stride>(extent); + break; default: break; } @@ -247,6 +258,18 @@ size_t DeviceAllocation::construct_layout( case library::LayoutTypeID::kTensorNDHWC: return construct_layout_(bytes, layout_id, extent, stride); + case library::LayoutTypeID::kTensorNC32HW32: + return construct_layout_>(bytes, layout_id, extent, stride); + + case library::LayoutTypeID::kTensorNC64HW64: + return construct_layout_>(bytes, layout_id, extent, stride); + + case library::LayoutTypeID::kTensorC32RSK32: + return construct_layout_>(bytes, layout_id, extent, stride); + + case library::LayoutTypeID::kTensorC64RSK64: + return construct_layout_>(bytes, layout_id, extent, stride); + default: break; } @@ -1362,6 +1385,18 @@ static void write_tensor_csv_static_type( case library::LayoutTypeID::kTensorNDHWC: write_tensor_csv_static_tensor_view(out, allocation); break; + case library::LayoutTypeID::kTensorNC32HW32: + write_tensor_csv_static_tensor_view>(out, allocation); + break; + case library::LayoutTypeID::kTensorNC64HW64: + write_tensor_csv_static_tensor_view>(out, allocation); + break; + case library::LayoutTypeID::kTensorC32RSK32: + write_tensor_csv_static_tensor_view>(out, allocation); + break; + case library::LayoutTypeID::kTensorC64RSK64: + write_tensor_csv_static_tensor_view>(out, allocation); + break; default: throw std::runtime_error("Unhandled layout"); } diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu index 2bbf2eeb11..edd6f07ce2 100644 --- a/tools/profiler/src/operation_profiler.cu +++ b/tools/profiler/src/operation_profiler.cu @@ -243,7 +243,7 @@ int OperationProfiler::profile_all( ProblemSpace::Iterator problem_it = problem_space.begin(); ProblemSpace::Iterator problem_end = problem_space.end(); - bool continue_profiling = true; + bool continue_profiling = true, internal_error = false; // For each problem in problem space for (; continue_profiling && problem_it != problem_end; ++problem_it) { @@ -302,7 +302,8 @@ int OperationProfiler::profile_all( if (status == Status::kErrorInternal) { // Stop profiling if there was an internal error - return false; + internal_error = true; + break; } else if (status != Status::kSuccess) { // If the workspace could not be initialized for any other reason, continue to @@ -322,7 +323,8 @@ int OperationProfiler::profile_all( if (status == Status::kErrorInternal) { // Stop profiling if there was an internal error - return false; + internal_error = true; + break; } else if (status != Status::kSuccess) { // If the workspace could not be initialized for any other reason, continue to @@ -336,8 +338,9 @@ int OperationProfiler::profile_all( // // B. Verify CUTLASS - if (continue_profiling) { - + + if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + continue_profiling = this->verify_cutlass( options, report, @@ -368,6 +371,7 @@ int OperationProfiler::profile_all( // // D. Profile // + if (continue_profiling && options.profiling.enabled) { continue_profiling = this->profile( @@ -392,10 +396,7 @@ int OperationProfiler::profile_all( } } - // 3. Emit report - report.close(); - - return 0; + return internal_error ? 
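The profile_all() change above stops returning from the middle of the loop on an internal error; it records the failure, breaks out to the common cleanup path, and converts the flag into the process exit code. A simplified sketch of that control flow, where the Status values and loop body are stand-ins rather than the profiler's own types:

#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical status codes standing in for cutlass::Status inside the profiler.
enum class Status { kSuccess, kErrorNotSupported, kErrorInternal };

// Internal errors stop the sweep and surface through the return code;
// unsupported problems are merely skipped.
int profile_all(std::vector<Status> const &workspace_results) {
  bool continue_profiling = true, internal_error = false;

  for (std::size_t i = 0; continue_profiling && i < workspace_results.size(); ++i) {
    Status status = workspace_results[i];

    if (status == Status::kErrorInternal) {
      internal_error = true;   // remember the failure...
      break;                   // ...but fall through to the common cleanup/reporting path
    }
    if (status != Status::kSuccess) {
      continue;                // not supported for this problem; try the next one
    }
    std::printf("profiled problem %zu\n", i);
  }

  return internal_error ? 1 : 0;   // non-zero exit signals the internal error
}

int main() {
  std::vector<Status> ok{Status::kSuccess, Status::kErrorNotSupported, Status::kSuccess};
  std::vector<Status> bad{Status::kSuccess, Status::kErrorInternal, Status::kSuccess};
  std::printf("exit codes: %d %d\n", profile_all(ok), profile_all(bad));
  return 0;
}

Leaving through a single exit point keeps report flushing and destructors on the normal path even when profiling aborts early.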
1 : 0; } /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index e2d3e131f0..6bac578072 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -401,6 +401,7 @@ Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) { else { providers.push_back(library::Provider::kCUTLASS); providers.push_back(library::Provider::kCUBLAS); + providers.push_back(library::Provider::kCUDNN); } } @@ -428,8 +429,8 @@ void Options::Profiling::print_usage(std::ostream &out) const { << " --providers= " << " List of providers to be profiled for performance. (default: '*')" << end_of_line - << " Gemm providers {cutlass*" - << "}" << end_of_line + << " Gemm providers {cutlass*, cublas*}" << end_of_line + << " Conv2d providers {cutlass*, cudnn*}" << "\n\n"; } @@ -502,6 +503,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) { else { providers.push_back(library::Provider::kCUBLAS); providers.push_back(library::Provider::kReferenceDevice); + providers.push_back(library::Provider::kCUDNN); } } @@ -529,6 +531,7 @@ void Options::Verification::print_usage(std::ostream &out) const { << " --verification-providers= " << " List of providers used to verify result. (default: '*')" << end_of_line << " Gemm verification-providers {cublas*}" << end_of_line + << " Conv2d verification-providers {cudnn*, device*, host}" << "\n\n"; } @@ -570,6 +573,7 @@ Options::Report::Report(cutlass::CommandLine const &cmdline) { cmdline.get_cmd_line_argument("append", append, false); cmdline.get_cmd_line_argument("output", output_path); + cmdline.get_cmd_line_argument("junit-output", junit_output_path); if (cmdline.check_cmd_line_flag("tags")) { cmdline.get_cmd_line_argument_pairs("tags", pivot_tags); @@ -591,6 +595,9 @@ void Options::Report::print_usage(std::ostream &out) const { << " --output= " << " Path to output file for machine readable results. Operation kind and '.csv' is appended.\n\n" + << " --junit-output= " + << " Path to junit output file for result reporting. 
Operation kind and '.junit.xml' is appended.\n\n" + << " --report-not-run= " << " If true, reports the status of all kernels including those that" << end_of_line << " do not satisfy the given arguments.\n\n" @@ -608,6 +615,7 @@ void Options::Report::print_options(std::ostream &out, int indent) const { out << indent_str(indent) << "append: " << append << "\n" << indent_str(indent) << "output: " << output_path << "\n" + << indent_str(indent) << "junit-output: " << junit_output_path << "\n" << indent_str(indent) << "report_not_run: " << report_not_run << "\n" << indent_str(indent) << "tags:\n"; diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index 48463efa50..79e0169970 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -218,6 +218,9 @@ class Options { /// Path to a file containing results std::string output_path; + /// Path to a file containing junit xml results + std::string junit_output_path; + /// Sequence of tags to attach to each result std::vector> pivot_tags; diff --git a/tools/profiler/src/performance_report.cpp b/tools/profiler/src/performance_report.cpp index 07a7edc955..de184eb04b 100644 --- a/tools/profiler/src/performance_report.cpp +++ b/tools/profiler/src/performance_report.cpp @@ -69,11 +69,15 @@ PerformanceReport::PerformanceReport( options_(options), argument_names_(argument_names), problem_index_(0), good_(true), op_kind_(op_kind) { // Strip '.csv' if present - std::string base_path = options_.report.output_path.substr( - 0, options_.report.output_path.rfind(".csv")); - + std::string base_path = options_.report.output_path; + base_path = base_path.substr(0, base_path.rfind(".csv")); op_file_name_ = base_path + "." + to_string(op_kind_) + ".csv"; + base_path = options_.report.junit_output_path; + base_path = base_path.substr(0, base_path.rfind(".xml")); + base_path = base_path.substr(0, base_path.rfind(".junit")); + op_junit_file_name_ = base_path + "." + to_string(op_kind_) + ".junit.xml"; + // // Open output file for operation of PerformanceReport::op_kind // @@ -108,6 +112,21 @@ PerformanceReport::PerformanceReport( print_csv_header_(output_file_) << std::endl; } } + + if (!options_.report.junit_output_path.empty()) { + + junit_output_file_.open(op_junit_file_name_); + + if (!junit_output_file_.good()) { + + std::cerr << "Could not open junit output file at path '" + << options_.report.junit_output_path << "'" << std::endl; + + good_ = false; + } + + print_junit_header_(junit_output_file_); + } } void PerformanceReport::next_problem() { @@ -123,6 +142,10 @@ void PerformanceReport::append_result(PerformanceResult result) { print_result_pretty_(std::cout, result) << std::flush; } + if (junit_output_file_.is_open()) { + print_junit_result_(junit_output_file_, result); + } + if (output_file_.is_open()) { print_result_csv_(output_file_, result) << std::endl; } @@ -143,7 +166,7 @@ void PerformanceReport::append_results(PerformanceResultVector const &results) { } } -void PerformanceReport::close() { +PerformanceReport::~PerformanceReport() { // // Output results to stdout if they were not written to a file already. 
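PerformanceReport derives per-operation file names by stripping a trailing '.csv' (or '.junit'/'.xml') from the user-supplied path and splicing in the operation kind. A small sketch of that suffix handling; the helper names are illustrative:

#include <cstdio>
#include <string>

// The user-supplied path may or may not carry the extension; the operation kind is
// spliced in ahead of it either way.
std::string csv_path(std::string base, std::string const &op) {
  base = base.substr(0, base.rfind(".csv"));
  return base + "." + op + ".csv";
}

std::string junit_path(std::string base, std::string const &op) {
  base = base.substr(0, base.rfind(".xml"));
  base = base.substr(0, base.rfind(".junit"));
  return base + "." + op + ".junit.xml";
}

int main() {
  std::printf("%s\n", csv_path("results.csv", "conv2d").c_str());        // results.conv2d.csv
  std::printf("%s\n", csv_path("results", "conv2d").c_str());            // results.conv2d.csv
  std::printf("%s\n", junit_path("report.junit.xml", "gemm").c_str());   // report.gemm.junit.xml
  return 0;
}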
@@ -161,7 +184,17 @@ void PerformanceReport::close() { } } else if (output_file_.is_open() && options_.report.verbose) { - std::cout << "\n\nWrote results to '" << op_file_name_ << "'" << std::endl; + std::cout << "\nWrote results to '" << op_file_name_ << "'" << std::endl; + } + + if (output_file_.is_open()) { + output_file_.close(); + } + + if (junit_output_file_.is_open()) { + print_junit_footer_(junit_output_file_); + junit_output_file_.close(); + std::cout << "\nWrote jUnit results to '" << op_junit_file_name_ << "'" << std::endl; } } @@ -179,7 +212,8 @@ static const char *disposition_status_color(Disposition disposition) { /// Prints the result in human readable form std::ostream & PerformanceReport::print_result_pretty_( std::ostream &out, - PerformanceResult const &result) { + PerformanceResult const &result, + bool use_shell_coloring) { out << "=============================\n" << " Problem ID: " << result.problem_index << "\n"; @@ -196,14 +230,20 @@ std::ostream & PerformanceReport::print_result_pretty_( out << "\n"; } + std::string shell_color_bright = use_shell_coloring ? SHELL_COLOR_BRIGHT() : ""; + std::string shell_color_end = use_shell_coloring ? SHELL_COLOR_END() : ""; + auto _disposition_status_color = [&](Disposition d) -> const char * { + return use_shell_coloring ? disposition_status_color(d) : ""; + }; + out << "\n" - << " Provider: " << SHELL_COLOR_BRIGHT() << library::to_string(result.provider, true) << SHELL_COLOR_END() << "\n" - << " OperationKind: " << SHELL_COLOR_BRIGHT() << library::to_string(result.op_kind) << SHELL_COLOR_END() << "\n" + << " Provider: " << shell_color_bright << library::to_string(result.provider, true) << shell_color_end << "\n" + << " OperationKind: " << shell_color_bright << library::to_string(result.op_kind) << shell_color_end << "\n" << " Operation: " << result.operation_name << "\n\n" - << " Status: " << SHELL_COLOR_BRIGHT() << library::to_string(result.status, true) << SHELL_COLOR_END() << "\n" - << " Verification: " << SHELL_COLOR_BRIGHT() << (options_.verification.enabled ? "ON":"OFF") << SHELL_COLOR_END() << "\n" - << " Disposition: " << disposition_status_color(result.disposition) << to_string(result.disposition, true) << SHELL_COLOR_END() << "\n\n"; + << " Status: " << shell_color_bright << library::to_string(result.status, true) << shell_color_end << "\n" + << " Verification: " << shell_color_bright << (options_.verification.enabled ? 
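print_result_pretty_() now takes a use_shell_coloring flag so the same routine can feed both the colored console report and the plain text embedded in the jUnit output. A sketch of the pattern; the ANSI escape constants below are common values chosen for illustration, not the profiler's SHELL_COLOR_* definitions:

#include <cstdio>
#include <string>

// Illustrative stand-ins for the profiler's shell-color helpers.
static char const *BRIGHT = "\033[1m";
static char const *RESET  = "\033[0m";

// One printing routine serves both destinations by threading a coloring flag through it.
void print_status(std::string const &label, std::string const &value, bool use_shell_coloring) {
  std::string bright = use_shell_coloring ? BRIGHT : "";
  std::string reset  = use_shell_coloring ? RESET  : "";
  std::printf("  %s: %s%s%s\n", label.c_str(), bright.c_str(), value.c_str(), reset.c_str());
}

int main() {
  print_status("Provider", "CUTLASS", true);   // colored for the terminal
  print_status("Provider", "CUTLASS", false);  // plain for machine-readable output
  return 0;
}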
"ON":"OFF") << shell_color_end << "\n" + << " Disposition: " << _disposition_status_color(result.disposition) << to_string(result.disposition, true) << shell_color_end << "\n\n"; // Display individual verification results for each verification-provider if (options_.verification.enabled) { @@ -263,10 +303,6 @@ std::ostream & PerformanceReport::print_csv_header_( << ",OperationKind,Operation,Disposition,Status"; for (auto const &arg_name : argument_names_) { - // Operand E is internal to the sparse kernel - if (arg_name.compare("E") == 0) - continue; - out << "," << arg_name; } @@ -327,6 +363,112 @@ std::ostream & PerformanceReport::print_result_csv_( return out; } +std::ostream & PerformanceReport::print_junit_header_(std::ostream &out) { + + out << "" << std::endl; + out << "" << std::endl; + return out; + +} + +namespace { + + std::string escape_xml_special_chars(const std::string& src) { + std::stringstream dst; + for (char ch : src) { + switch (ch) { + case '&': dst << "&"; break; + case '\'': dst << "'"; break; + case '"': dst << """; break; + case '<': dst << "<"; break; + case '>': dst << ">"; break; + default: dst << ch; break; + } + } + return dst.str(); + } + + template + std::ostream & print_junit_result_property_(std::ostream & os, const std::string & name, const T & property) { + return os << " " << std::endl; + } +} + +std::ostream & PerformanceReport::print_junit_result_(std::ostream &out, PerformanceResult const &result) { + + out << " " << "" << std::endl; + + if (failed) { + out << " " << std::endl; + } + + if (error) { + out << " " << std::endl; + } + + out << " " << std::endl; + + out << " " << std::endl; + + return out; + +} + +std::ostream & PerformanceReport::print_junit_footer_(std::ostream &out) { + + out << "" << std::endl; + return out; + +} + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler diff --git a/tools/profiler/src/performance_report.h b/tools/profiler/src/performance_report.h index 1c086e6185..5005103158 100644 --- a/tools/profiler/src/performance_report.h +++ b/tools/profiler/src/performance_report.h @@ -59,6 +59,12 @@ class PerformanceReport { /// Output file containing results std::ofstream output_file_; + /// Operation file name containing junit performance report of op_kind + std::string op_junit_file_name_; + + /// Output file containing junit results + std::ofstream junit_output_file_; + /// Flag indicating the performance report is valid bool good_; @@ -74,6 +80,7 @@ class PerformanceReport { public: PerformanceReport(Options const &options, std::vector const &argument_names, library::OperationKind const &op_kind); + ~PerformanceReport(); bool good() const { return good_; } @@ -81,8 +88,6 @@ class PerformanceReport { void append_result(PerformanceResult result); void append_results(PerformanceResultVector const &results); - void close(); - public: /// Prints the CSV header @@ -91,10 +96,21 @@ class PerformanceReport { /// Prints the CSV std::ostream & print_result_csv_(std::ostream &out, PerformanceResult const &result); + /// @defgroup jUnit Result Generation + /// Functions related to generation of the jUnit results + /// @{ + + std::ostream & print_junit_header_(std::ostream &out); + std::ostream & print_junit_result_(std::ostream &out, PerformanceResult const &result); + std::ostream & print_junit_footer_(std::ostream &out); + + /// @} + /// Prints the result in human readable form std::ostream & print_result_pretty_( std::ostream &out, - PerformanceResult const &result); 
+ PerformanceResult const &result, + bool use_shell_coloring = true); }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/problem_space.cpp b/tools/profiler/src/problem_space.cpp index e69b0110e9..a8c4943218 100644 --- a/tools/profiler/src/problem_space.cpp +++ b/tools/profiler/src/problem_space.cpp @@ -961,6 +961,85 @@ bool arg_as_SplitKModeID( ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ConvModeID( + library::ConvModeID &conv_mode, + KernelArgument::Value const *value_ptr) { + + if (value_ptr->not_null) { + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + + conv_mode = library::from_string( + static_cast(value_ptr)->element); + + if (conv_mode == library::ConvModeID::kInvalid) { + throw std::runtime_error( + "arg_as_ConvModeID() - illegal cast."); + } + } + else { + + throw std::runtime_error( + "arg_as_ConvModeID() - illegal cast."); + } + return true; + } + return false; +} + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ConvModeID( + library::ConvModeID &conv_mode, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + return arg_as_ConvModeID(conv_mode, value_ptr); +} + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ProviderID( + library::Provider &provider, + KernelArgument::Value const *value_ptr) { + + if (value_ptr->not_null) { + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + + provider = library::from_string( + static_cast(value_ptr)->element); + + if (provider == library::Provider::kInvalid) { + throw std::runtime_error( + "arg_as_ProviderID() - illegal cast."); + } + } + else { + + throw std::runtime_error( + "arg_as_ProviderID() - illegal cast."); + } + return true; + } + return false; +} + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ProviderID( + library::Provider &provider, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + return arg_as_ProviderID(provider, value_ptr); +} +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Lexically casts an argument to a given type stored in a byte array. Returns true if not null. 
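The arg_as_ConvModeID()/arg_as_ProviderID() helpers follow one pattern: return false when the argument is absent, parse it when present, and throw on an unrecognized value. A compact sketch of that pattern with a hypothetical enumeration and parser standing in for the library's from_string<>:

#include <cstdio>
#include <stdexcept>
#include <string>

// Hypothetical enumeration and parser used only to illustrate the arg_as_*ID pattern.
enum class ConvModeID { kCrossCorrelation, kConvolution, kInvalid };

ConvModeID from_string(std::string const &s) {
  if (s == "cross") return ConvModeID::kCrossCorrelation;
  if (s == "conv")  return ConvModeID::kConvolution;
  return ConvModeID::kInvalid;
}

// Returns true if the argument was supplied; throws if supplied but malformed.
bool arg_as_conv_mode(ConvModeID &mode, bool not_null, std::string const &text) {
  if (!not_null) {
    return false;                       // argument absent: leave the caller's default in place
  }
  mode = from_string(text);
  if (mode == ConvModeID::kInvalid) {
    throw std::runtime_error("arg_as_conv_mode() - illegal cast.");
  }
  return true;
}

int main() {
  ConvModeID mode = ConvModeID::kCrossCorrelation;
  bool set = arg_as_conv_mode(mode, true, "conv");
  std::printf("set=%d mode_is_conv=%d\n", set, mode == ConvModeID::kConvolution);
  return 0;
}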
bool arg_as_scalar( std::vector &bytes, @@ -1049,9 +1128,94 @@ bool tensor_description_satisfies( return false; } +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if conv_kind satisfies the value +bool conv_kind_satisfies( + library::ConvKind const &conv_kind, + EnumeratedTypeArgument::EnumeratedTypeValue const *value_ptr) { + + if (value_ptr->not_null) { + library::ConvKind conv_kind_cmd_line = + library::from_string(value_ptr->element); + + if (conv_kind_cmd_line != library::ConvKind::kUnknown && + conv_kind_cmd_line != conv_kind) { + + return false; + } + } + + return true; +} + +/// Returns true if conv_kind satisfies the value +bool conv_kind_satisfies( + library::ConvKind const &conv_kind, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + return conv_kind_satisfies( + conv_kind, + static_cast(value_ptr)); + } + else { + throw std::runtime_error("Kernel argument mismatch"); + } + + return false; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if a iterator algorithm satisfies the value +bool iterator_algorithm_satisfies( + library::IteratorAlgorithmID const &iterator_algorithm, + EnumeratedTypeArgument::EnumeratedTypeValue const *value_ptr) { + + if (value_ptr->not_null) { + library::IteratorAlgorithmID iterator_algorithm_cmd_line = + library::from_string(value_ptr->element); + + if (iterator_algorithm_cmd_line != library::IteratorAlgorithmID::kNone && + iterator_algorithm_cmd_line != iterator_algorithm) { + + return false; + } + } + + return true; +} + +/// Returns true if a iterator algorithm satisfies the value +bool iterator_algorithm_satisfies( + library::IteratorAlgorithmID const &iterator_algorithm, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + return iterator_algorithm_satisfies( + iterator_algorithm, + static_cast(value_ptr)); + } + else { + throw std::runtime_error("Kernel argument mismatch"); + } + + return false; +} + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/tools/profiler/src/problem_space.h b/tools/profiler/src/problem_space.h index 8a9ee4f2e8..8e10dbafce 100644 --- a/tools/profiler/src/problem_space.h +++ b/tools/profiler/src/problem_space.h @@ -909,6 +909,37 @@ bool arg_as_SplitKModeID( ProblemSpace const &problem_space, ProblemSpace::Problem const &problem); +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ConvModeID(library::ConvModeID &conv_mode, KernelArgument::Value const *value_ptr); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. 
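conv_kind_satisfies() and iterator_algorithm_satisfies() treat an unset or unrecognized command-line value as a wildcard and only reject a kernel when an explicit filter disagrees. A minimal sketch of that predicate with a hypothetical ConvKind:

#include <cstdio>

// Hypothetical ConvKind with an explicit "unknown" wildcard: an unset or unknown
// command-line value matches every kernel; otherwise the kinds must match exactly.
enum class ConvKind { kUnknown, kFprop, kDgrad, kWgrad };

bool conv_kind_satisfies(ConvKind kernel_kind, bool value_not_null, ConvKind cmd_line_kind) {
  if (value_not_null &&
      cmd_line_kind != ConvKind::kUnknown &&
      cmd_line_kind != kernel_kind) {
    return false;   // an explicit, recognized filter that disagrees rejects the kernel
  }
  return true;      // everything else passes
}

int main() {
  std::printf("%d\n", conv_kind_satisfies(ConvKind::kFprop, false, ConvKind::kUnknown)); // 1
  std::printf("%d\n", conv_kind_satisfies(ConvKind::kFprop, true,  ConvKind::kFprop));   // 1
  std::printf("%d\n", conv_kind_satisfies(ConvKind::kFprop, true,  ConvKind::kDgrad));   // 0
  return 0;
}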
+bool arg_as_ConvModeID( + library::ConvModeID &conv_mode, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_IteratorAlgorithmID(library::IteratorAlgorithmID &iterator_algorithm, KernelArgument::Value const *value_ptr); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_IteratorAlgorithmID( + library::IteratorAlgorithmID &iterator_algorithm, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ProviderID(library::Provider &provider, KernelArgument::Value const *value_ptr); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ProviderID( + library::Provider &provider, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + /// Lexically casts an argument to a given type stored in a byte array. Returns true if not null. bool arg_as_scalar( std::vector &bytes, @@ -935,10 +966,34 @@ bool tensor_description_satisfies( ProblemSpace const &problem_space, ProblemSpace::Problem const &problem); + +/// Returns true if a conv kind satisfies the value +bool conv_kind_satisfies( + library::ConvKind const &conv_kind, + EnumeratedTypeArgument::EnumeratedTypeValue const *value_ptr); + +/// Returns true if a conv kind satisfies the value +bool conv_kind_satisfies( + library::ConvKind const &conv_kind, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +/// Returns true if a iterator algorithm satisfies the value +bool iterator_algorithm_satisfies( + library::IteratorAlgorithmID const &iterator_algorithm, + EnumeratedTypeArgument::EnumeratedTypeValue const *value_ptr); + +/// Returns true if a iterator algorithm satisfies the value +bool iterator_algorithm_satisfies( + library::IteratorAlgorithmID const &iterator_algorithm, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler } // namespace cutlass //////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/tools/profiler/src/reduction_operation_profiler.h b/tools/profiler/src/reduction_operation_profiler.h new file mode 100644 index 0000000000..e00dcc0b60 --- /dev/null +++ b/tools/profiler/src/reduction_operation_profiler.h @@ -0,0 +1,167 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines profiling functionality for reduction operation + +*/ + +#pragma once + +#include +#include +#include +#include +#include + +// CUTLASS Library includes +#include "cutlass/library/library.h" +#include "cutlass/library/util.h" +#include "cutlass/library/manifest.h" + +// Profiler includes +#include "options.h" +#include "device_context.h" +#include "operation_profiler.h" +#include "performance_result.h" +#include "problem_space.h" +#if CUTLASS_ENABLE_CUDNN +#include "cudnn_helpers.h" +#endif //#if CUTLASS_ENABLE_CUDNN +#include "debug.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Abstract base class for each math function +class ReductionOperationProfiler : public OperationProfiler { +public: + + + /// Workspace used + struct ReductionWorkspace { + + /// Conv device allocations + DeviceAllocation *Workspace; + DeviceAllocation *Source; + DeviceAllocation *Destination; + DeviceAllocation *Reference; + + /// Library configuration and arguments + library::ReductionConfiguration configuration; + library::ReductionArguments arguments; + + /// Buffer used for the cutlass operations' host workspace + std::vector host_workspace; + + /// Buffer used for the cutlass operations' device workspace + DeviceAllocation device_workspace; + + // + // Methods + // + + ReductionWorkspace(): + Workspace(nullptr), Source(nullptr), Destination(nullptr), Reference(nullptr) { } + }; + +protected: + + // + // Data members + // + + /// Reduction problem obtained from problem space + MatrixCoord problem_; + + /// Device memory allocations + ReductionWorkspace conv_workspace_; + + +public: + // + // Methods + // + + /// Ctor + ReductionOperationProfiler(Options const &options); + + /// Destructor + virtual ~ReductionOperationProfiler(); + + /// Prints usage statement for the math function + virtual void print_usage(std::ostream &out) const; + + /// Prints examples + virtual void print_examples(std::ostream &out) const; + + /// Extracts the problem dimensions + virtual Status initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes 
workspace + virtual Status initialize_workspace( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against references + virtual bool verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Measures performance results + virtual bool profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.cu b/tools/profiler/src/sparse_gemm_operation_profiler.cu index 702b79bb6c..7eff2062b0 100644 --- a/tools/profiler/src/sparse_gemm_operation_profiler.cu +++ b/tools/profiler/src/sparse_gemm_operation_profiler.cu @@ -227,6 +227,9 @@ void SparseGemmOperationProfiler::SparseGemmProblem::initialize_result( set_argument(result, "C", problem_space, std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout)); + set_argument(result, "E", problem_space, + std::string(library::to_string(operation_desc.E.element)) + ":" + library::to_string(operation_desc.E.layout)); + set_argument(result, "m", problem_space, m); set_argument(result, "n", problem_space, n); set_argument(result, "k", problem_space, k); diff --git a/tools/util/include/cutlass/util/host_reorder.h b/tools/util/include/cutlass/util/host_reorder.h index 1d12add3ef..660ee0f956 100644 --- a/tools/util/include/cutlass/util/host_reorder.h +++ b/tools/util/include/cutlass/util/host_reorder.h @@ -62,6 +62,18 @@ void reorder_column(TensorRef dest, } } +template +void reorder_convK(TensorRef dest, + TensorRef src, + cutlass::gemm::GemmCoord problem_size) { + + TensorRef> mappedDest(dest.data(), dest.stride(0)); + TensorRef> mappedSrc(src.data(), src.stride(0)); + + reorder_column( + mappedDest, mappedSrc, problem_size); +} + /// This is needed for the sparse tensor core kernels. The purpose /// is to use ldmatrix to load from shared memory to the register file. template diff --git a/tools/util/include/cutlass/util/reference/device/convolution.h b/tools/util/include/cutlass/util/reference/device/convolution.h new file mode 100644 index 0000000000..843b6b15b9 --- /dev/null +++ b/tools/util/include/cutlass/util/reference/device/convolution.h @@ -0,0 +1,1536 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Reference implementation for convolution in device-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + +namespace cutlass { +namespace reference { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Conv2d device reference kernel +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d Fprop kernel - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t npq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_p[kThreadM]; + int thread_q[kThreadM]; + + // Compute N, P, Q coordinates for each row of a 
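The reference fprop kernels assign each thread a run of GEMM-M rows and decompose every linear index back into output coordinates. A host-side sketch of the same (n, p, q) decomposition, with illustrative sizes:

#include <cstdint>
#include <cstdio>

struct Coord { int n, p, q; };

// Decomposes a linear GEMM-M index into (n, p, q) the way the fprop reference does:
// npq = ((n * P) + p) * Q + q.
Coord decompose_npq(int64_t npq, int P, int Q) {
  int64_t PQ = int64_t(P) * Q;
  Coord c;
  c.n = int(npq / PQ);
  int64_t residual = npq % PQ;
  c.p = int(residual / Q);
  c.q = int(residual % Q);
  return c;
}

int main() {
  int P = 7, Q = 5;
  Coord c = decompose_npq(2 * (int64_t(P) * Q) + 3 * Q + 4, P, Q);
  std::printf("n=%d p=%d q=%d\n", c.n, c.p, c.q);   // n=2 p=3 q=4
  return 0;
}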
thread's tile + int64_t PQ = int64_t(problem_size.P) * problem_size.Q; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t npq = npq_start + m; + + thread_n[m] = int(npq / PQ); + + int64_t residual = npq % PQ; + thread_p[m] = int(residual / problem_size.Q); + thread_q[m] = int(residual % problem_size.Q); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int C = 0; C < problem_size.C; ++C) { + + // Load from activations tensor + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (thread_n[m] < problem_size.N && h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) { + element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], h, w, C})); + } + else { + element_A[m] = ElementAccumulator(); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + + if (thread_k < problem_size.K) { + element_B[n] = ElementAccumulator(tensor_w.at({thread_k, R, S, C})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + if (thread_n[m] < problem_size.N && thread_p[m] < problem_size.P && thread_q[m] < problem_size.Q) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + if (thread_k < problem_size.K) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k})); + } + + tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d Fprop kernel - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator 
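The fprop reference maps an output position and filter tap to an input coordinate as h = p*stride_h - pad_h + r*dilation_h, flipping the filter index first when the mode is kConvolution. A small sketch of that mapping; the parameter values are illustrative:

#include <cstdio>

// Maps an output coordinate and filter tap back to an input coordinate, including
// the filter flip that turns cross-correlation into true convolution.
int input_coord(int out, int filter, int filter_extent, int stride, int pad, int dilation,
                bool true_convolution) {
  if (true_convolution) {
    filter = filter_extent - 1 - filter;   // kConvolution mode flips the filter taps
  }
  return out * stride - pad + filter * dilation;
}

int main() {
  // p = 3, r = 1 of a 3-tap filter, stride 2, pad 1, dilation 1:
  std::printf("cross-correlation h = %d\n", input_coord(3, 1, 3, 2, 1, 1, false)); // 6
  std::printf("convolution       h = %d\n", input_coord(3, 1, 3, 2, 1, 1, true));  // 6 (center tap)
  std::printf("convolution r=0   h = %d\n", input_coord(3, 0, 3, 2, 1, 1, true));  // 7
  return 0;
}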
element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t nzpq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_z[kThreadM]; + int thread_p[kThreadM]; + int thread_q[kThreadM]; + + // Compute N, Z, P, Q coordinates for each row of a thread's tile + int64_t PQ = int64_t(problem_size.P) * problem_size.Q; + int64_t ZPQ = PQ * problem_size.Z; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t nzpq = nzpq_start + m; + + thread_n[m] = int(nzpq / ZPQ); + + int64_t residual = nzpq % ZPQ; + thread_z[m] = int(residual / PQ); + + residual = residual % PQ; + thread_p[m] = int(residual / problem_size.Q); + thread_q[m] = int(residual % problem_size.Q); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int T = 0; T < problem_size.T; ++T) { + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int C = 0; C < problem_size.C; ++C) { + + // Load from activations tensor + int filter_t = T; + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - R; + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int d = thread_z[m] * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d; + int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (thread_n[m] < problem_size.N && + d >= 0 && d < problem_size.D && + h >= 0 && h < problem_size.H && + w >= 0 && w < problem_size.W) { + + element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], d, h, w, C})); + } + else { + element_A[m] = ElementAccumulator(); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + + if (thread_k < problem_size.K) { + element_B[n] = ElementAccumulator(tensor_w.at({thread_k, T, R, S, C})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && + thread_z[m] < problem_size.Z && + thread_p[m] < problem_size.P && + thread_q[m] < problem_size.Q) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + if (thread_k < problem_size.K) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k})); + } + + tensor_y_out.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } // for 
(n) + + } + } // for (m) +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d dgrad kernel - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv2dDgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t nhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_h[kThreadM]; + int thread_w[kThreadM]; + + // Compute N, H, W coordinates for each row of a thread's tile + int64_t HW = int64_t(problem_size.H) * problem_size.W; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t nhw = nhw_start + m; + + thread_n[m] = int(nhw / HW); + + int64_t residual = nhw % HW; + thread_h[m] = int(residual / problem_size.W); + thread_w[m] = int(residual % problem_size.W); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int K = 0; K < problem_size.K; ++K) { + + // Load from activations tensor + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w; + + element_A[m] = ElementAccumulator(); + + if (p >= 0 && !(p % problem_size.stride_h) && q >= 0 && !(q % problem_size.stride_w)) { + + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (thread_n[m] < problem_size.N && p < problem_size.P && q < problem_size.Q) { + element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], p, q, K})); + } + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + + if (thread_c < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_w.at({K, R, S, thread_c})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out 
the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && thread_h[m] < problem_size.H && thread_w[m] < problem_size.W) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + if (thread_c < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_h[m], thread_w[m], thread_c})); + } + + tensor_dx_out.at({thread_n[m], thread_h[m], thread_w[m], thread_c}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d dgrad kernel - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv3dDgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t ndhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_d[kThreadM]; + int thread_h[kThreadM]; + int thread_w[kThreadM]; + + // Compute N, H, W coordinates for each row of a thread's tile + int64_t HW = int64_t(problem_size.H) * problem_size.W; + int64_t DHW = HW * problem_size.D; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t ndhw = ndhw_start + m; + + thread_n[m] = int(ndhw / DHW); + + int64_t residual = ndhw % DHW; + thread_d[m] = int(residual / HW); + + residual = residual % HW; + thread_h[m] = int(residual / problem_size.W); + thread_w[m] = int(residual % problem_size.W); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int T = 0; T < problem_size.T; ++T) { + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int K = 0; K < problem_size.K; ++K) { + + // Load from activations tensor + int filter_t = T; + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - T; + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int z = thread_d[m] + problem_size.pad_d - filter_t * problem_size.dilation_d; + int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w; + + element_A[m] = ElementAccumulator(); + + if (z >= 0 && !(z % problem_size.stride_d) && + p >= 0 && 
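Dgrad inverts that mapping: an input row only receives a contribution when h + pad - r*dilation is non-negative, divisible by the stride, and lands inside the output extent, which is the check the reference dgrad kernels perform before indexing dy. A host-side sketch of the check, with illustrative parameters:

#include <cstdio>

// Returns true and sets p_out if output row p contributes to input row h through
// filter tap r; mirrors the divisibility and range tests in the dgrad references.
bool contributing_output_row(int h, int r, int stride, int pad, int dilation,
                             int P, int &p_out) {
  int p = h + pad - r * dilation;
  if (p < 0 || (p % stride) != 0) {
    return false;            // this (h, r) pair does not align with any output row
  }
  p /= stride;
  if (p >= P) {
    return false;            // aligned, but outside the output extent
  }
  p_out = p;
  return true;
}

int main() {
  int p;
  // h = 5, r = 0, stride 2, pad 1, dilation 1, P = 4:
  if (contributing_output_row(5, 0, 2, 1, 1, 4, p)) {
    std::printf("dy row p = %d contributes\n", p);   // p = 3
  }
  // h = 4 with the same parameters fails the divisibility check:
  std::printf("h=4 contributes: %d\n", contributing_output_row(4, 0, 2, 1, 1, 4, p)); // 0
  return 0;
}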
!(p % problem_size.stride_h) && + q >= 0 && !(q % problem_size.stride_w)) { + + z = z / problem_size.stride_d; + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (thread_n[m] < problem_size.N && z < problem_size.Z && p < problem_size.P && q < problem_size.Q) { + element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], z, p, q, K})); + } + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + + if (thread_c < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_w.at({K, T, R, S, thread_c})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && + thread_d[m] < problem_size.D && + thread_h[m] < problem_size.H && + thread_w[m] < problem_size.W) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + if (thread_c < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c})); + } + + tensor_dx_out.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d wgrad kernel - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 8, // shape of a threadblock in units of threads + int kCtaShapeN = 16 // shape of a threadblock in units of threads +> +__global__ void Conv2dWgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int64_t rsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_r[kThreadN]; + int thread_s[kThreadN]; + int thread_c[kThreadN]; + + // Compute R, S, C coordinates for each row of a thread's tile + int64_t SC = int64_t(problem_size.S) * problem_size.C; + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + int64_t rsc = rsc_start + n; + int64_t residual = rsc % SC; + + thread_r[n] = int(rsc / SC); + thread_s[n] = int(residual / problem_size.C); + thread_c[n] = int(residual % problem_size.C); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < 
kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int N = 0; N < problem_size.N; ++N) { + for (int P = 0; P < problem_size.P; ++P) { + for (int Q = 0; Q < problem_size.Q; ++Q) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + element_A[m] = ElementAccumulator(); + + if (thread_k < problem_size.K) { + element_A[m] = ElementAccumulator(tensor_dy.at({N, P, Q, thread_k})); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + // Load from activations tensor + int filter_r = thread_r[n]; + int filter_s = thread_s[n]; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - filter_r; + filter_s = problem_size.S - 1 - filter_s; + } + + int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + element_B[n] = ElementAccumulator(); + + if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W && thread_c[n] < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_x.at({N, h, w, thread_c[n]})); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + if (thread_k < problem_size.K) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + if (thread_r[n] < problem_size.R && thread_s[n] < problem_size.S && thread_c[n] < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_r[n], thread_s[n], thread_c[n]})); + } + + tensor_dw_out.at({thread_k, thread_r[n], thread_s[n], thread_c[n]}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d wgrad kernel - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 8, // shape of a threadblock in units of threads + int kCtaShapeN = 16 // shape of a threadblock in units of threads +> +__global__ void Conv3dWgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int64_t trsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_t[kThreadN]; + int thread_r[kThreadN]; + int thread_s[kThreadN]; + int thread_c[kThreadN]; + + // 
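Wgrad reduces over the output positions rather than the filter taps: each dw[k][r][s][c] accumulates dy[n][p][q][k] * x[n][h][w][c] across every (n, p, q). The sketch below computes a single entry on the host with unit stride, zero padding, and tiny illustrative shapes, so it is a simplification of the kernel above rather than a drop-in equivalent:

#include <cstdio>
#include <vector>

// One weight-gradient entry dw[k][r][s][c], reduced over all output positions whose
// receptive field touches that filter tap. NHWC-style packed storage is assumed.
float wgrad_entry(std::vector<float> const &dy, std::vector<float> const &x,
                  int N, int P, int Q, int K, int H, int W, int C,
                  int k, int r, int s, int c) {
  float acc = 0.f;
  for (int n = 0; n < N; ++n) {
    for (int p = 0; p < P; ++p) {
      for (int q = 0; q < Q; ++q) {
        int h = p + r;   // stride 1, pad 0, dilation 1 for brevity
        int w = q + s;
        if (h < H && w < W) {
          acc += dy[((n * P + p) * Q + q) * K + k] * x[((n * H + h) * W + w) * C + c];
        }
      }
    }
  }
  return acc;
}

int main() {
  // 1x3x3x1 activation of ones and 1x2x2x1 gradient of ones -> each dw entry is 4.
  int N = 1, H = 3, W = 3, C = 1, P = 2, Q = 2, K = 1;
  std::vector<float> x(N * H * W * C, 1.f), dy(N * P * Q * K, 1.f);
  std::printf("dw[0][0][0][0] = %f\n", wgrad_entry(dy, x, N, P, Q, K, H, W, C, 0, 0, 0, 0));
  return 0;
}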
Compute R, S, C coordinates for each row of a thread's tile + int64_t SC = int64_t(problem_size.S) * problem_size.C; + int64_t RSC = SC * problem_size.R; + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + int64_t trsc = trsc_start + n; + + thread_t[n] = int(trsc / RSC); + + int64_t residual = trsc % RSC; + thread_r[n] = int(residual / SC); + + residual = residual % SC; + thread_s[n] = int(residual / problem_size.C); + thread_c[n] = int(residual % problem_size.C); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int N = 0; N < problem_size.N; ++N) { + for (int Z = 0; Z < problem_size.Z; ++Z) { + for (int P = 0; P < problem_size.P; ++P) { + for (int Q = 0; Q < problem_size.Q; ++Q) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + element_A[m] = ElementAccumulator(); + + if (thread_k < problem_size.K) { + element_A[m] = ElementAccumulator(tensor_dy.at({N, Z, P, Q, thread_k})); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + // Load from activations tensor + int filter_t = thread_t[n]; + int filter_r = thread_r[n]; + int filter_s = thread_s[n]; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - filter_t; + filter_r = problem_size.R - 1 - filter_r; + filter_s = problem_size.S - 1 - filter_s; + } + + int d = Z * problem_size.stride_d - problem_size.pad_w + filter_t * problem_size.dilation_d; + int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + element_B[n] = ElementAccumulator(); + + if (d >= 0 && d < problem_size.D && + h >= 0 && h < problem_size.H && + w >= 0 && w < problem_size.W && + thread_c[n] < problem_size.C) { + + element_B[n] = ElementAccumulator(tensor_x.at({N, d, h, w, thread_c[n]})); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (Q) + } // for (P) + } // for (Z) + } // for (N) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + if (thread_k < problem_size.K) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + if (thread_t[n] < problem_size.T && + thread_r[n] < problem_size.R && + thread_s[n] < problem_size.S && + thread_c[n] < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]})); + } + + tensor_dw_out.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Conv2d Fprop dispatcher - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename 
ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t npq = int64_t(problem_size.N) * problem_size.P * problem_size.Q; + int64_t blocks_m = (npq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv2dFprop< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_x, + tensor_w, + tensor_y_in, + tensor_y_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Fprop dispatcher - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t nzpq = int64_t(problem_size.N) * problem_size.Z * problem_size.P * problem_size.Q; + int64_t blocks_m = (nzpq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv3dFprop< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_x, + tensor_w, + tensor_y_in, + tensor_y_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv2d Dgrad dispatcher - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + 
typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dDgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t nhw = int64_t(problem_size.N) * problem_size.H * problem_size.W; + int64_t blocks_m = (nhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv2dDgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_w, + tensor_dx_in, + tensor_dx_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Dgrad dispatcher - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dDgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t ndhw = int64_t(problem_size.N) * problem_size.D * problem_size.H * problem_size.W; + int64_t blocks_m = (ndhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv3dDgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_w, + tensor_dx_in, + tensor_dx_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv2d Wgrad dispatcher - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + 
typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dWgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 8; // shape of a threadblock in units of threads + int const kCtaShapeN = 16; // shape of a threadblock in units of threads + + int64_t rsc = int64_t(problem_size.R) * problem_size.S * problem_size.C; + int64_t blocks_n = (rsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n)); + + kernel::Conv2dWgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_x, + tensor_dw_in, + tensor_dw_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Wgrad dispatcher - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dWgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 8; // shape of a threadblock in units of threads + int const kCtaShapeN = 16; // shape of a threadblock in units of threads + + int64_t trsc = int64_t(problem_size.T) * problem_size.R * problem_size.S * problem_size.C; + int64_t blocks_n = (trsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n)); + + kernel::Conv3dWgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_x, + tensor_dw_in, + tensor_dw_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + 
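The device-side dispatchers above (Conv2dFprop through Conv3dWgrad) each pick fixed blocking factors, compute a launch grid, and invoke the corresponding `kernel::` reference kernel on the supplied stream. The sketch below shows how one of them might be called to produce a reference fprop output for later comparison; it is illustrative only — the tensor extents, element types, and the `Conv2dProblemSize` constructor arguments are assumptions and should be checked against `cutlass/conv/conv2d_problem_size.h` and `cutlass/util/host_tensor.h`.

```cpp
#include "cutlass/cutlass.h"
#include "cutlass/tensor_coord.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/convolution.h"

// Builds a small NHWC fprop problem and runs the device-side reference dispatcher.
// Extents and element types here are illustrative assumptions.
cutlass::Status run_reference_fprop() {

  using Layout = cutlass::layout::TensorNHWC;

  // Problem: N=1, H=8, W=8, C=16 input; K=32 filters of size R=3, S=3; pad=1, stride=1.
  cutlass::Tensor4DCoord input_size(1, 8, 8, 16);    // (N, H, W, C)
  cutlass::Tensor4DCoord filter_size(32, 3, 3, 16);  // (K, R, S, C)
  cutlass::Tensor4DCoord padding(1, 1, 1, 1);
  cutlass::MatrixCoord conv_stride(1, 1);
  cutlass::MatrixCoord dilation(1, 1);

  cutlass::conv::Conv2dProblemSize problem_size(
      input_size, filter_size, padding, conv_stride, dilation,
      cutlass::conv::Mode::kCrossCorrelation);

  // Host + device allocations for activations, filters, and output (P = Q = 8 here).
  cutlass::HostTensor<float, Layout> x(input_size);
  cutlass::HostTensor<float, Layout> w(filter_size);
  cutlass::HostTensor<float, Layout> y(cutlass::Tensor4DCoord(1, 8, 8, 32));

  // ... fill x and w on the host, then x.sync_device(); w.sync_device(); ...

  // y is passed as both the C and D operands; with beta == 0 only D is written.
  return cutlass::reference::device::Conv2dFprop<
      float, Layout, float, Layout, float, Layout, float>(
      problem_size,
      x.device_ref(), w.device_ref(),
      y.device_ref(), y.device_ref(),
      1.0f, 0.0f);
}
```

Passing the output tensor for both the C and D operands is the usual pattern when beta is zero, since the epilogue then never reads the C operand.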
+/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2d( + conv::Operator convolutional_operator, + conv::Conv2dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + return Conv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + case conv::Operator::kDgrad: + return Conv2dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + case conv::Operator::kWgrad: + return Conv2dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + default: break; + } + + return Status::kErrorNotSupported; +} + +/// Generic 3D convolution targeting Conv3dFprop, Conv3dDgrad, and Conv3dWgrad. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3d( + conv::Operator convolutional_operator, + conv::Conv3dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + return Conv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + case conv::Operator::kDgrad: + return Conv3dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + case conv::Operator::kWgrad: + return Conv3dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + default: break; + } + + return Status::kErrorNotSupported; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/util/include/cutlass/util/reference/host/convolution.h b/tools/util/include/cutlass/util/reference/host/convolution.h new file mode 
100644
index 0000000000..48f5db81ea
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/convolution.h
@@ -0,0 +1,767 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Reference implementation for convolution in host-side code.
+*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Forward propagation +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// y = conv2d(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + for (int k = 0; k < problem_size.K; ++k) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) { + + ElementA a = tensor_x.at({n, h, w, c}); + ElementB b = tensor_w.at({k, r, s, c}); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_y_in.at(cutlass::make_Coord(n, p, q, k)); + } + + tensor_y_out.at(cutlass::make_Coord(n, p, q, k)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } +} + +/// Depthwise-separable convolution +template , + typename InnerProductOp = multiply_add > +void Depsep_Fprop( + cutlass::TensorView tensor_A, + cutlass::TensorView tensor_B, + cutlass::TensorView tensor_C, + ElementCompute alpha, + ElementCompute beta, + cutlass::Tensor4DCoord padding, + cutlass::Coord<2> conv_stride, + cutlass::Coord<2> dilation, + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < tensor_C.extent().n(); ++n) { + for (int p = 0; p < tensor_C.extent().h(); ++p) { + for (int q = 0; q < tensor_C.extent().w(); ++q) { + for (int g = 0; g < tensor_C.extent().c(); ++g) { + ElementAccumulator acc = ElementAccumulator(); + for (int r = 0; r < tensor_B.extent().h(); ++r) { + for (int s = 0; s < 
tensor_B.extent().w(); ++s) { + if ((p * conv_stride[0] - padding[0] + r * dilation[0]) < tensor_A.extent().h() && + (p * conv_stride[0] - padding[0] + r * dilation[0]) >= 0 && + (q * conv_stride[1] - padding[2] + s * dilation[1]) < tensor_A.extent().w() && + (q * conv_stride[1] - padding[2] + s * dilation[1]) >= 0) { + ElementA a = tensor_A.at( + cutlass::make_Coord(n, + p * conv_stride[0] - padding[0] + r * dilation[0], + q * conv_stride[1] - padding[2] + s * dilation[1], + g)); + + ElementB b = (mode == cutlass::conv::Mode::kCrossCorrelation) + ? tensor_B.at(cutlass::make_Coord(g, r, s, 0)) + : tensor_B.at(cutlass::make_Coord( + g, tensor_B.extent().h() - r - 1, tensor_B.extent().w() - s - 1, 0)); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = tensor_C.at(cutlass::make_Coord(n, p, q, g)); + tensor_C.at(cutlass::make_Coord(n, p, q, g)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Dgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dDgrad( + cutlass::conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int h = 0; h < problem_size.H; ++h) { + for (int w = 0; w < problem_size.W; ++w) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int k = 0; k < problem_size.K; ++k) { + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w; + + if (p >= 0 && (p % problem_size.stride_h) == 0 && + q >= 0 && (q % problem_size.stride_w) == 0) { + + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (p < problem_size.P && q < problem_size.Q) { + + ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k)); + ElementB b = tensor_w.at(cutlass::make_Coord(k, r, s, c)); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + + } // for (K) + } // for (S) + } // for (R) + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dx_in.at(cutlass::make_Coord(n, h, w, c)); + } + + tensor_dx_out.at(cutlass::make_Coord(n, h, w, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (W) + } // for (H) + } // for (N) +} + 
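The host-side routines above mirror the device kernels with plain nested loops, which makes them convenient as ground truth in unit tests. The sketch below wires the host `Conv2dDgrad` reference into a comparison against a dx tensor computed elsewhere (for example, by a CUTLASS implicit GEMM dgrad kernel); the helper name, the float element type, and the NHWC layouts are assumptions for illustration, while `HostTensor`, `host_ref()`, `host_view()`, and `TensorEquals` are existing CUTLASS utilities.

```cpp
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/host/tensor_compare.h"

// Recomputes dx on the host with the reference above and compares it element-wise
// against a dx tensor produced by the implementation under test.
bool verify_conv2d_dgrad(
    cutlass::conv::Conv2dProblemSize const &problem_size,
    cutlass::HostTensor<float, cutlass::layout::TensorNHWC> &dy,
    cutlass::HostTensor<float, cutlass::layout::TensorNHWC> &w,
    cutlass::HostTensor<float, cutlass::layout::TensorNHWC> &dx_computed) {

  using Layout = cutlass::layout::TensorNHWC;

  // Reference dx with the same extent as the tensor under test.
  cutlass::HostTensor<float, Layout> dx_reference(dx_computed.extent());

  // dx_reference = 1.0f * dgrad(dy, w) + 0.0f * dx_reference
  cutlass::reference::host::Conv2dDgrad<
      float, Layout, float, Layout, float, Layout, float>(
      problem_size,
      dy.host_ref(), w.host_ref(),
      dx_reference.host_ref(), dx_reference.host_ref(),
      1.0f, 0.0f);

  // Element-wise equality of the two host views.
  return cutlass::reference::host::TensorEquals(
      dx_computed.host_view(), dx_reference.host_view());
}
```

For low-precision or floating-point accumulators, a tolerance-based comparison may be preferable to exact equality.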
+//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Wgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dWgrad( + cutlass::conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta) { + + InnerProductOp inner_product_op; + ConvertOp convert_op; + + // Apply MMA and accumulate ElementAccumulator + for (int k = 0; k < problem_size.K; ++k) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + + cutlass::Tensor4DCoord b_coord; + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + b_coord = make_Coord( + n, + p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h, + q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w, + c); + + if (b_coord.h() < problem_size.H && b_coord.h() >= 0 && + b_coord.w() < problem_size.W && b_coord.w() >= 0) { + + ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, p, q, k))); + ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord)); + acc = inner_product_op(a, b, acc); + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dw_in.at(cutlass::make_Coord(k, r, s, c)); + } + + tensor_dw_out.at(cutlass::make_Coord(k, r, s, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (S) + } // for (R) + } // for (K) +} + +/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2d( + conv::Operator convolutional_operator, + conv::Conv2dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + Conv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kDgrad: + Conv2dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kWgrad: + Conv2dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + default: + break; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// 3D convolution +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// y = conv3d(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int z = 0; z < problem_size.Z; ++z) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + for (int k = 0; k < problem_size.K; ++k) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int d = z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d; + int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (d >= 0 && d < problem_size.D && + h >=0 && h < problem_size.H && + w >= 0 && w < problem_size.W) { + + ElementA a = tensor_x.at({n, d, h, w, c}); + ElementB b = tensor_w.at({k, t, r, s, c}); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + } + } + } + + // Apply Epilogue, compute 
ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_y_in.at(cutlass::make_Coord(n, z, p, q, k)); + } + + tensor_y_out.at(cutlass::make_Coord(n, z, p, q, k)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Dgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dDgrad( + cutlass::conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int d = 0; d < problem_size.D; ++d) { + for (int h = 0; h < problem_size.H; ++h) { + for (int w = 0; w < problem_size.W; ++w) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int k = 0; k < problem_size.K; ++k) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int z = d + problem_size.pad_d - filter_t * problem_size.dilation_d; + int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w; + + if (z >= 0 && (z % problem_size.stride_d) == 0 && + p >= 0 && (p % problem_size.stride_h) == 0 && + q >= 0 && (q % problem_size.stride_w) == 0) { + + z = z / problem_size.stride_d; + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (z < problem_size.Z && p < problem_size.P && q < problem_size.Q) { + + ElementA a = tensor_dy.at(cutlass::make_Coord(n, z, p, q, k)); + ElementB b = tensor_w.at(cutlass::make_Coord(k, t, r, s, c)); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + + } // for (K) + } // for (S) + } // for (R) + } // for (T) + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dx_in.at(cutlass::make_Coord(n, d, h, w, c)); + } + + tensor_dx_out.at(cutlass::make_Coord(n, d, h, w, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (W) + } // for (H) + } // for (D) + } // for (N) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Wgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename 
ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dWgrad( + cutlass::conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta) { + + InnerProductOp inner_product_op; + ConvertOp convert_op; + + // Apply MMA and accumulate ElementAccumulator + for (int k = 0; k < problem_size.K; ++k) { + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int n = 0; n < problem_size.N; ++n) { + for (int z = 0; z < problem_size.Z; ++z) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + Tensor5DCoord b_coord = make_Coord( + n, + z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d, + p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h, + q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w, + c); + + if (b_coord.d() < problem_size.D && b_coord.d() >= 0 && + b_coord.h() < problem_size.H && b_coord.h() >= 0 && + b_coord.w() < problem_size.W && b_coord.w() >= 0) { + + ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, z, p, q, k))); + ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord)); + + acc = inner_product_op(a, b, acc); + } + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dw_in.at(cutlass::make_Coord(k, t, r, s, c)); + } + + tensor_dw_out.at(cutlass::make_Coord(k, t, r, s, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + } // for (K) +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Generic 3D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3d( + conv::Operator convolutional_operator, + conv::Conv3dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + Conv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kDgrad: + Conv3dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kWgrad: + Conv3dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + default: + break; + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h index 98db6dcd95..6381aa3066 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm.h +++ b/tools/util/include/cutlass/util/reference/host/gemm.h @@ -249,6 +249,45 @@ struct Gemm +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Partial specialization for multiply-add-saturate template